Commit 793333a3 authored by Ilya Dryomov

rbd: introduce copyup state machine

Both write and copyup paths will get more complex with object map.
Factor copyup code out into a separate state machine.

While at it, take advantage of obj_req->osd_reqs list and issue empty
and current snapc OSD requests together, one after another.
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
parent ea9b743c
...@@ -226,6 +226,7 @@ enum obj_operation_type { ...@@ -226,6 +226,7 @@ enum obj_operation_type {
#define RBD_OBJ_FLAG_DELETION (1U << 0) #define RBD_OBJ_FLAG_DELETION (1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED (1U << 1) #define RBD_OBJ_FLAG_COPYUP_ENABLED (1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS (1U << 2)
enum rbd_obj_read_state { enum rbd_obj_read_state {
RBD_OBJ_READ_START = 1, RBD_OBJ_READ_START = 1,
...@@ -261,9 +262,15 @@ enum rbd_obj_read_state { ...@@ -261,9 +262,15 @@ enum rbd_obj_read_state {
enum rbd_obj_write_state { enum rbd_obj_write_state {
RBD_OBJ_WRITE_START = 1, RBD_OBJ_WRITE_START = 1,
RBD_OBJ_WRITE_OBJECT, RBD_OBJ_WRITE_OBJECT,
RBD_OBJ_WRITE_READ_FROM_PARENT, __RBD_OBJ_WRITE_COPYUP,
RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC, RBD_OBJ_WRITE_COPYUP,
RBD_OBJ_WRITE_COPYUP_OPS, };
enum rbd_obj_copyup_state {
RBD_OBJ_COPYUP_START = 1,
RBD_OBJ_COPYUP_READ_PARENT,
__RBD_OBJ_COPYUP_WRITE_OBJECT,
RBD_OBJ_COPYUP_WRITE_OBJECT,
}; };
struct rbd_obj_request { struct rbd_obj_request {
...@@ -286,12 +293,15 @@ struct rbd_obj_request { ...@@ -286,12 +293,15 @@ struct rbd_obj_request {
u32 bvec_idx; u32 bvec_idx;
}; };
}; };
enum rbd_obj_copyup_state copyup_state;
struct bio_vec *copyup_bvecs; struct bio_vec *copyup_bvecs;
u32 copyup_bvec_count; u32 copyup_bvec_count;
struct list_head osd_reqs; /* w/ r_private_item */ struct list_head osd_reqs; /* w/ r_private_item */
struct mutex state_mutex; struct mutex state_mutex;
struct pending_result pending;
struct kref kref; struct kref kref;
}; };
...@@ -2568,8 +2578,8 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) ...@@ -2568,8 +2578,8 @@ static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
#define MODS_ONLY U32_MAX #define MODS_ONLY U32_MAX
static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req, static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
u32 bytes) u32 bytes)
{ {
struct ceph_osd_request *osd_req; struct ceph_osd_request *osd_req;
int ret; int ret;
...@@ -2595,7 +2605,8 @@ static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req, ...@@ -2595,7 +2605,8 @@ static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req,
return 0; return 0;
} }
static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
u32 bytes)
{ {
struct ceph_osd_request *osd_req; struct ceph_osd_request *osd_req;
int num_ops = count_write_ops(obj_req); int num_ops = count_write_ops(obj_req);
...@@ -2628,33 +2639,6 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes) ...@@ -2628,33 +2639,6 @@ static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
return 0; return 0;
} }
/*
 * Pick and issue the first copyup request for @obj_req, recording the
 * choice in obj_req->write_state.  @bytes is the amount of copyup data
 * held in obj_req->copyup_bvecs.  Returns whatever the selected issue
 * helper returns.
 */
static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
{
	bool deep_copyup;

	/*
	 * Zero copyup data is equivalent to the object not existing, so
	 * drop it rather than spend I/O and network bandwidth on it.
	 */
	if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
		dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
		bytes = 0;
	}

	deep_copyup = obj_req->img_request->snapc->num_snaps && bytes > 0;
	if (!deep_copyup) {
		/* A single request with the current snapshot context will do. */
		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
		return rbd_obj_issue_copyup_ops(obj_req, bytes);
	}

	/*
	 * Snapshots exist and there is real data: first deep-copyup the
	 * object through all existing snapshots using an empty snapshot
	 * context.  The actual modification follows in a second request
	 * that carries the current snapshot context.
	 */
	obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC;
	return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes);
}
static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{ {
u32 i; u32 i;
...@@ -2688,7 +2672,7 @@ static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) ...@@ -2688,7 +2672,7 @@ static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
* target object up to the overlap point (if any) from the parent, * target object up to the overlap point (if any) from the parent,
* so we can use it for a copyup. * so we can use it for a copyup.
*/ */
static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
{ {
struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
int ret; int ret;
...@@ -2703,22 +2687,111 @@ static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) ...@@ -2703,22 +2687,111 @@ static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
* request -- pass MODS_ONLY since the copyup isn't needed * request -- pass MODS_ONLY since the copyup isn't needed
* anymore. * anymore.
*/ */
obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS; return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
} }
ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
if (ret) if (ret)
return ret; return ret;
obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT;
return rbd_obj_read_from_parent(obj_req); return rbd_obj_read_from_parent(obj_req);
} }
static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
{
u32 bytes = rbd_obj_img_extents_bytes(obj_req);
int ret;
rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
/*
* Only send non-zero copyup data to save some I/O and network
* bandwidth -- zero copyup data is equivalent to the object not
* existing.
*/
if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
bytes = 0;
if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
/*
* Send a copyup request with an empty snapshot context to
* deep-copyup the object through all existing snapshots.
* A second request with the current snapshot context will be
* sent for the actual modification.
*/
ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
if (ret) {
obj_req->pending.result = ret;
return;
}
obj_req->pending.num_pending++;
bytes = MODS_ONLY;
}
ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
if (ret) {
obj_req->pending.result = ret;
return;
}
obj_req->pending.num_pending++;
}
/*
 * Drive the copyup state machine of @obj_req, keyed on
 * obj_req->copyup_state.
 *
 * @result carries the outcome of the previous step on entry and the
 * outcome of the copyup on exit.
 *
 * Returns true once the copyup has finished (successfully or not, see
 * *result) and false when the state machine has to be re-entered later
 * to make further progress.
 */
static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
{
int ret;
again:
switch (obj_req->copyup_state) {
case RBD_OBJ_COPYUP_START:
/* A copyup must not be started with an error already recorded. */
rbd_assert(!*result);
ret = rbd_obj_copyup_read_parent(obj_req);
if (ret) {
*result = ret;
return true;
}
/*
 * With image extents left there is parent data to wait for.
 * NOTE(review): with none left, rbd_obj_copyup_read_parent()
 * presumably has issued the MODS_ONLY request itself, so only
 * its completion remains -- confirm against that function.
 */
if (obj_req->num_img_extents)
obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
else
obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
return false;
case RBD_OBJ_COPYUP_READ_PARENT:
if (*result)
return true;
/* Remember all-zero parent data so that it is not sent along. */
if (is_zero_bvecs(obj_req->copyup_bvecs,
rbd_obj_img_extents_bytes(obj_req))) {
dout("%s %p detected zeros\n", __func__, obj_req);
obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
}
rbd_obj_copyup_write_object(obj_req);
/*
 * No request got issued: nothing will complete later, so pick
 * up the recorded result and finish right away.
 */
if (!obj_req->pending.num_pending) {
*result = obj_req->pending.result;
obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
goto again;
}
obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
return false;
case __RBD_OBJ_COPYUP_WRITE_OBJECT:
/*
 * NOTE(review): pending_result_dec() presumably returns true only
 * once every pending request has completed, leaving the combined
 * outcome in *result -- confirm against its definition.
 */
if (!pending_result_dec(&obj_req->pending, result))
return false;
/* fall through */
case RBD_OBJ_COPYUP_WRITE_OBJECT:
/* Copyup is done; *result holds the outcome. */
return true;
default:
BUG();
}
}
static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result) static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
{ {
struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
int ret; int ret;
again:
switch (obj_req->write_state) { switch (obj_req->write_state) {
case RBD_OBJ_WRITE_START: case RBD_OBJ_WRITE_START:
rbd_assert(!*result); rbd_assert(!*result);
...@@ -2733,12 +2806,10 @@ static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result) ...@@ -2733,12 +2806,10 @@ static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
case RBD_OBJ_WRITE_OBJECT: case RBD_OBJ_WRITE_OBJECT:
if (*result == -ENOENT) { if (*result == -ENOENT) {
if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) { if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
ret = rbd_obj_handle_write_guard(obj_req); *result = 0;
if (ret) { obj_req->copyup_state = RBD_OBJ_COPYUP_START;
*result = ret; obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
return true; goto again;
}
return false;
} }
/* /*
* On a non-existent object: * On a non-existent object:
...@@ -2747,31 +2818,19 @@ static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result) ...@@ -2747,31 +2818,19 @@ static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
if (obj_req->flags & RBD_OBJ_FLAG_DELETION) if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
*result = 0; *result = 0;
} }
/* fall through */
case RBD_OBJ_WRITE_COPYUP_OPS:
return true;
case RBD_OBJ_WRITE_READ_FROM_PARENT:
if (*result) if (*result)
return true; return true;
ret = rbd_obj_issue_copyup(obj_req, obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
rbd_obj_img_extents_bytes(obj_req)); goto again;
if (ret) { case __RBD_OBJ_WRITE_COPYUP:
*result = ret; if (!rbd_obj_advance_copyup(obj_req, result))
return true; return false;
} /* fall through */
return false; case RBD_OBJ_WRITE_COPYUP:
case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC:
if (*result) if (*result)
return true; rbd_warn(rbd_dev, "copyup failed: %d", *result);
return true;
obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
if (ret) {
*result = ret;
return true;
}
return false;
default: default:
BUG(); BUG();
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment