io_uring: make CQ ring wakeups be more efficient

For batched IO, it's not uncommon for waiters to ask for more than 1 IO to complete before being woken up. This is a problem with wait_event() since tasks will get woken for every IO that completes, re-check condition, then go back to sleep. For batch counts on the order of what you do for high IOPS, that can result in 10s of extra wakeups for the waiting task. Add a private wake function that checks for the wake up count criteria being met before calling autoremove_wake_function(). Pavel reports that one test case he has runs 40% faster with proper batching of wakeups. Reported-by: Pavel Begunkov <asml.silence@gmail.com> Tested-by: Pavel Begunkov <asml.silence@gmail.com> Reviewed-by: Pavel Begunkov <asml.silence@gmail.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>

io_uring: make CQ ring wakeups be more efficient
For batched IO, it's not uncommon for waiters to ask for more than 1 IO to complete before being woken up. This is a problem with wait_event() since tasks will get woken for every IO that completes, re-check condition, then go back to sleep. For batch counts on the order of what you do for high IOPS, that can result in 10s of extra wakeups for the waiting task. Add a private wake function that checks for the wake up count criteria being met before calling autoremove_wake_function(). Pavel reports that one test case he has runs 40% faster with proper batching of wakeups. Reported-by: Pavel Begunkov <asml.silence@gmail.com> Tested-by: Pavel Begunkov <asml.silence@gmail.com> Reviewed-by: Pavel Begunkov <asml.silence@gmail.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
bda52162 · Jens Axboe · daa5de54 · bda52162
Commit bda52162 authored Sep 24, 2019 by Jens Axboe
Hide whitespace changes
Inline Side-by-side

Showing with 56 additions and 10 deletions

fs/io_uring.c fs/io_uring.c +56 -10

No files found.
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2768,6 +2768,38 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 	return submit;
 }
+struct io_wait_queue {
+	struct wait_queue_entry wq;
+	struct io_ring_ctx *ctx;
+	unsigned to_wait;
+	unsigned nr_timeouts;
+};
+static inline bool io_should_wake(struct io_wait_queue *iowq)
+{
+	struct io_ring_ctx *ctx = iowq->ctx;
+	/*
+	 * Wake up if we have enough events, or if a timeout occured since we
+	 * started waiting. For timeouts, we always want to return to userspace,
+	 * regardless of event count.
+	 */
+	return io_cqring_events(ctx->rings) >= iowq->to_wait ||
+			atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
+}
+static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
+			    int wake_flags, void *key)
+{
+	struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue,
+							wq);
+	if (!io_should_wake(iowq))
+		return -1;
+	return autoremove_wake_function(curr, mode, wake_flags, key);
+}
 /*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
@@ -2775,8 +2807,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
 static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			  const sigset_t __user *sig, size_t sigsz)
 {
+	struct io_wait_queue iowq = {
+		.wq = {
+			.private	= current,
+			.func		= io_wake_function,
+			.entry		= LIST_HEAD_INIT(iowq.wq.entry),
+		},
+		.ctx		= ctx,
+		.to_wait	= min_events,
+	};
 	struct io_rings *rings = ctx->rings;
-	unsigned nr_timeouts;
 	int ret;
 	if (io_cqring_events(rings) >= min_events)
@@ -2795,15 +2835,21 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 			return ret;
 	}
-	nr_timeouts = atomic_read(&ctx->cq_timeouts);
+	ret = 0;
-	/*
+	iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
-	 * Return if we have enough events, or if a timeout occured since
+	do {
-	 * we started waiting. For timeouts, we always want to return to
+		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
-	 * userspace.
+						TASK_INTERRUPTIBLE);
-	 */
+		if (io_should_wake(&iowq))
-	ret = wait_event_interruptible(ctx->wait,
+			break;
-				io_cqring_events(rings) >= min_events ||
+		schedule();
-				atomic_read(&ctx->cq_timeouts) != nr_timeouts);
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+	} while (1);
+	finish_wait(&ctx->wait, &iowq.wq);
 	restore_saved_sigmask_unless(ret == -ERESTARTSYS);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;