Commit 5da9c836 authored by Lars Ellenberg, committed by Philipp Reisner

drbd: better separate WRITE and READ code paths in drbd_make_request

cherry-picked and adapted from drbd 9 devel branch

READs are of interest to at most one connection;
WRITEs should be of interest to all established connections.

Introduce some helper functions to hopefully make this easier to follow.
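
For orientation, a minimal standalone sketch of the dispatch shape this change
moves toward (not DRBD code; every identifier below is hypothetical): a READ is
routed to at most one suitable path, while a WRITE fans out to every
established connection.

/* Sketch only: hypothetical types and helpers, not the DRBD API. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum req_rw { REQ_READ, REQ_WRITE };

struct connection {
	const char *name;
	bool established;
};

/* Pick one connection for a READ, or NULL to fall back to the local disk. */
static struct connection *pick_read_path(struct connection *conns, size_t n)
{
	for (size_t i = 0; i < n; i++)
		if (conns[i].established)
			return &conns[i];
	return NULL;
}

/* Queue a WRITE on every established connection; return how many were queued. */
static int queue_write_on_all(struct connection *conns, size_t n)
{
	int queued = 0;

	for (size_t i = 0; i < n; i++)
		if (conns[i].established) {
			printf("queue WRITE on %s\n", conns[i].name);
			queued++;
		}
	return queued;
}

static void make_request(enum req_rw rw, struct connection *conns, size_t n)
{
	if (rw == REQ_WRITE) {
		if (queue_write_on_all(conns, n) == 0)
			printf("no remote path: WRITE stays local only\n");
		return;
	}

	/* READ (or readahead): one path is enough. */
	struct connection *c = pick_read_path(conns, n);
	if (c)
		printf("READ served via %s\n", c->name);
	else
		printf("READ served from the local disk\n");
}

int main(void)
{
	struct connection conns[] = {
		{ "peer0", true  },
		{ "peer1", false },
	};

	make_request(REQ_READ, conns, 2);
	make_request(REQ_WRITE, conns, 2);
	return 0;
}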
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
parent b6dd1a89
@@ -304,15 +304,21 @@ void req_may_be_completed(struct drbd_request *req, struct bio_and_error *m)
/* Update disk stats */
_drbd_end_io_acct(mdev, req);
/* if READ failed,
/* If READ failed,
* have it be pushed back to the retry work queue,
* so it will re-enter __drbd_make_request,
* so it will re-enter __drbd_make_request(),
* and be re-assigned to a suitable local or remote path,
* or failed if we do not have access to good data anymore.
* READA may fail.
*
* Unless it was failed early by __drbd_make_request(),
* because no path was available, in which case
* it was not even added to the transfer_log.
*
* READA may fail, and will not be retried.
*
* WRITE should have used all available paths already.
*/
if (!ok && rw == READ)
if (!ok && rw == READ && !list_empty(&req->tl_requests))
req->rq_state |= RQ_POSTPONED;
if (!(req->rq_state & RQ_POSTPONED)) {
@@ -725,19 +731,12 @@ static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int
return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0;
}
static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector)
static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector,
enum drbd_read_balancing rbm)
{
enum drbd_read_balancing rbm;
struct backing_dev_info *bdi;
int stripe_shift;
if (mdev->state.pdsk < D_UP_TO_DATE)
return false;
rcu_read_lock();
rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing;
rcu_read_unlock();
switch (rbm) {
case RB_CONGESTED_REMOTE:
bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
@@ -798,17 +797,160 @@ static void complete_conflicting_writes(struct drbd_request *req)
finish_wait(&mdev->misc_wait, &wait);
}
/* called within req_lock and rcu_read_lock() */
static bool conn_check_congested(struct drbd_conf *mdev)
{
struct drbd_tconn *tconn = mdev->tconn;
struct net_conf *nc;
bool congested = false;
enum drbd_on_congestion on_congestion;
nc = rcu_dereference(tconn->net_conf);
on_congestion = nc ? nc->on_congestion : OC_BLOCK;
if (on_congestion == OC_BLOCK ||
tconn->agreed_pro_version < 96)
return false;
if (nc->cong_fill &&
atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) {
dev_info(DEV, "Congestion-fill threshold reached\n");
congested = true;
}
if (mdev->act_log->used >= nc->cong_extents) {
dev_info(DEV, "Congestion-extents threshold reached\n");
congested = true;
}
if (congested) {
if (mdev->tconn->current_tle_writes)
/* start a new epoch for non-mirrored writes */
start_new_tl_epoch(mdev->tconn);
if (on_congestion == OC_PULL_AHEAD)
_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
else /*nc->on_congestion == OC_DISCONNECT */
_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
}
return congested;
}
/* If this returns false, and req->private_bio is still set,
* this should be submitted locally.
*
* If it returns false, but req->private_bio is not set,
* we do not have access to good data :(
*
* Otherwise, this destroys req->private_bio, if any,
* and returns true.
*/
static bool do_remote_read(struct drbd_request *req)
{
struct drbd_conf *mdev = req->w.mdev;
enum drbd_read_balancing rbm;
if (req->private_bio) {
if (!drbd_may_do_local_read(mdev,
req->i.sector, req->i.size)) {
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(mdev);
}
}
if (mdev->state.pdsk != D_UP_TO_DATE)
return false;
/* TODO: improve read balancing decisions, take into account drbd
* protocol, pending requests etc. */
rcu_read_lock();
rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing;
rcu_read_unlock();
if (rbm == RB_PREFER_LOCAL && req->private_bio)
return false; /* submit locally */
if (req->private_bio == NULL)
return true;
if (remote_due_to_read_balancing(mdev, req->i.sector, rbm)) {
if (req->private_bio) {
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(mdev);
}
return true;
}
return false;
}
/* returns number of connections (== 1, for drbd 8.4)
* expected to actually write this data,
* which does NOT include those that we are L_AHEAD for. */
static int drbd_process_write_request(struct drbd_request *req)
{
struct drbd_conf *mdev = req->w.mdev;
int remote, send_oos;
rcu_read_lock();
remote = drbd_should_do_remote(mdev->state);
if (remote) {
conn_check_congested(mdev);
remote = drbd_should_do_remote(mdev->state);
}
send_oos = drbd_should_send_out_of_sync(mdev->state);
rcu_read_unlock();
if (!remote && !send_oos)
return 0;
D_ASSERT(!(remote && send_oos));
if (remote) {
_req_mod(req, TO_BE_SENT);
_req_mod(req, QUEUE_FOR_NET_WRITE);
} else if (drbd_set_out_of_sync(mdev, req->i.sector, req->i.size))
_req_mod(req, QUEUE_FOR_SEND_OOS);
return remote;
}
static void
drbd_submit_req_private_bio(struct drbd_request *req)
{
struct drbd_conf *mdev = req->w.mdev;
struct bio *bio = req->private_bio;
const int rw = bio_rw(bio);
bio->bi_bdev = mdev->ldev->backing_bdev;
/* State may have changed since we grabbed our reference on the
* ->ldev member. Double check, and short-circuit to endio.
* In case the last activity log transaction failed to get on
* stable storage, and this is a WRITE, we may not even submit
* this bio. */
if (get_ldev(mdev)) {
if (drbd_insert_fault(mdev,
rw == WRITE ? DRBD_FAULT_DT_WR
: rw == READ ? DRBD_FAULT_DT_RD
: DRBD_FAULT_DT_RA))
bio_endio(bio, -EIO);
else
generic_make_request(bio);
put_ldev(mdev);
} else
bio_endio(bio, -EIO);
}
int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
{
const int rw = bio_rw(bio);
const int size = bio->bi_size;
const sector_t sector = bio->bi_sector;
struct bio_and_error m = { NULL, };
struct drbd_request *req;
struct net_conf *nc;
int local, remote, send_oos = 0;
int err = 0;
int ret = 0;
union drbd_dev_state s;
bool no_remote = false;
/* allocate outside of all locks; */
req = drbd_req_new(mdev, bio);
@@ -822,70 +964,23 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
}
req->start_time = start_time;
local = get_ldev(mdev);
if (!local) {
bio_put(req->private_bio); /* or we get a bio leak */
if (!get_ldev(mdev)) {
bio_put(req->private_bio);
req->private_bio = NULL;
}
if (rw == WRITE) {
remote = 1;
} else {
/* READ || READA */
if (local) {
if (!drbd_may_do_local_read(mdev, sector, size) ||
remote_due_to_read_balancing(mdev, sector)) {
/* we could kick the syncer to
* sync this extent asap, wait for
* it, then continue locally.
* Or just issue the request remotely.
*/
local = 0;
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(mdev);
}
}
remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
}
/* If we have a disk, but a READA request is mapped to remote,
* we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
* Just fail that READA request right here.
*
* THINK: maybe fail all READA when not local?
* or make this configurable...
* if network is slow, READA won't do any good.
*/
if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
err = -EWOULDBLOCK;
goto fail_and_free_req;
}
/* For WRITES going to the local disk, grab a reference on the target
* extent. This waits for any resync activity in the corresponding
* resync extent to finish, and, if necessary, pulls in the target
* extent into the activity log, which involves further disk io because
* of transactional on-disk meta data updates. */
if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
if (rw == WRITE && req->private_bio
&& !test_bit(AL_SUSPENDED, &mdev->flags)) {
req->rq_state |= RQ_IN_ACT_LOG;
drbd_al_begin_io(mdev, &req->i);
}
s = mdev->state;
remote = remote && drbd_should_do_remote(s);
send_oos = rw == WRITE && drbd_should_send_out_of_sync(s);
D_ASSERT(!(remote && send_oos));
if (!(local || remote) && !drbd_suspended(mdev)) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
err = -EIO;
goto fail_free_complete;
}
/* GOOD, everything prepared, grab the spin_lock */
spin_lock_irq(&mdev->tconn->req_lock);
if (rw == WRITE) {
/* This may temporarily give up the req_lock,
* but will re-acquire it before it returns here.
@@ -893,53 +988,28 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
complete_conflicting_writes(req);
}
if (drbd_suspended(mdev)) {
/* If we got suspended, use the retry mechanism in
drbd_make_request() to restart processing of this
bio. In the next call to drbd_make_request
we sleep in inc_ap_bio() */
ret = 1;
spin_unlock_irq(&mdev->tconn->req_lock);
goto fail_free_complete;
}
if (remote || send_oos) {
remote = drbd_should_do_remote(mdev->state);
send_oos = rw == WRITE && drbd_should_send_out_of_sync(mdev->state);
D_ASSERT(!(remote && send_oos));
/* no more giving up req_lock from now on! */
if (!(remote || send_oos))
dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
if (!(local || remote)) {
dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
spin_unlock_irq(&mdev->tconn->req_lock);
err = -EIO;
goto fail_free_complete;
if (drbd_suspended(mdev)) {
/* push back and retry: */
req->rq_state |= RQ_POSTPONED;
if (req->private_bio) {
bio_put(req->private_bio);
req->private_bio = NULL;
}
goto out;
}
/* Update disk stats */
_drbd_start_io_acct(mdev, req, bio);
/* NOTE
* Actually, 'local' may be wrong here already, since we may have failed
* to write to the meta data, and may become wrong anytime because of
* local io-error for some other request, which would lead to us
* "detaching" the local disk.
*
* 'remote' may become wrong any time because the network could fail.
*
* This is a harmless race condition, though, since it is handled
* correctly at the appropriate places; so it just defers the failure
* of the respective operation.
*/
/* mark them early for readability.
* this just sets some state flags. */
if (remote)
_req_mod(req, TO_BE_SENT);
if (local)
_req_mod(req, TO_BE_SUBMITTED);
/* We fail READ/READA early, if we can not serve it.
* We must do this before req is registered on any lists.
* Otherwise, req_may_be_completed() will queue failed READ for retry. */
if (rw != WRITE) {
if (!do_remote_read(req) && !req->private_bio)
goto nodata;
}
/* which transfer log epoch does this belong to? */
req->epoch = atomic_read(&mdev->tconn->current_tle_nr);
@@ -948,90 +1018,43 @@ int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long s
list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log);
/* NOTE remote first: to get the concurrent write detection right,
* we must register the request before start of local IO. */
if (remote) {
/* either WRITE and C_CONNECTED,
* or READ, and no local disk,
* or READ, but not in sync.
*/
_req_mod(req, (rw == WRITE)
? QUEUE_FOR_NET_WRITE
: QUEUE_FOR_NET_READ);
if (rw == WRITE) {
if (!drbd_process_write_request(req))
no_remote = true;
} else {
/* We either have a private_bio, or we can read from remote.
* Otherwise we had done the goto nodata above. */
if (req->private_bio == NULL) {
_req_mod(req, TO_BE_SENT);
_req_mod(req, QUEUE_FOR_NET_READ);
} else
no_remote = true;
}
if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
_req_mod(req, QUEUE_FOR_SEND_OOS);
rcu_read_lock();
nc = rcu_dereference(mdev->tconn->net_conf);
if (remote &&
nc->on_congestion != OC_BLOCK && mdev->tconn->agreed_pro_version >= 96) {
int congested = 0;
if (nc->cong_fill &&
atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) {
dev_info(DEV, "Congestion-fill threshold reached\n");
congested = 1;
}
if (mdev->act_log->used >= nc->cong_extents) {
dev_info(DEV, "Congestion-extents threshold reached\n");
congested = 1;
}
if (congested) {
if (mdev->tconn->current_tle_writes)
/* start a new epoch for non-mirrored writes */
start_new_tl_epoch(mdev->tconn);
if (nc->on_congestion == OC_PULL_AHEAD)
_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
else /*nc->on_congestion == OC_DISCONNECT */
_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
}
if (req->private_bio) {
/* needs to be marked within the same spinlock */
_req_mod(req, TO_BE_SUBMITTED);
/* but we need to give up the spinlock to submit */
spin_unlock_irq(&mdev->tconn->req_lock);
drbd_submit_req_private_bio(req);
/* once we have submitted, we must no longer look at req,
* it may already be destroyed. */
return 0;
} else if (no_remote) {
nodata:
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
/* A write may have been queued for send_oos, however.
* So we can not simply free it, we must go through req_may_be_completed() */
}
rcu_read_unlock();
out:
req_may_be_completed(req, &m);
spin_unlock_irq(&mdev->tconn->req_lock);
if (local) {
req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
/* State may have changed since we grabbed our reference on the
* mdev->ldev member. Double check, and short-circuit to endio.
* In case the last activity log transaction failed to get on
* stable storage, and this is a WRITE, we may not even submit
* this bio. */
if (get_ldev(mdev)) {
if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
: rw == READ ? DRBD_FAULT_DT_RD
: DRBD_FAULT_DT_RA))
bio_endio(req->private_bio, -EIO);
else
generic_make_request(req->private_bio);
put_ldev(mdev);
} else
bio_endio(req->private_bio, -EIO);
}
if (m.bio)
complete_master_bio(mdev, &m);
return 0;
fail_free_complete:
if (req->rq_state & RQ_IN_ACT_LOG)
drbd_al_complete_io(mdev, &req->i);
fail_and_free_req:
if (local) {
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(mdev);
}
if (!ret)
bio_endio(bio, err);
drbd_req_free(req);
dec_ap_bio(mdev);
return ret;
}
int drbd_make_request(struct request_queue *q, struct bio *bio)
......