Commit bda52162 authored by Jens Axboe

io_uring: make CQ ring wakeups be more efficient

For batched IO, it's not uncommon for waiters to ask for more than 1
IO to complete before being woken up. This is a problem with
wait_event() since tasks will get woken for every IO that completes,
re-check condition, then go back to sleep. For batch counts on the
order of what you do for high IOPS, that can result in 10s of extra
wakeups for the waiting task.

Add a private wake function that checks for the wake up count criteria
being met before calling autoremove_wake_function(). Pavel reports that
one test case he has runs 40% faster with proper batching of wakeups.
Reported-by: Pavel Begunkov <asml.silence@gmail.com>
Tested-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent daa5de54
...@@ -2768,6 +2768,38 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, ...@@ -2768,6 +2768,38 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
return submit; return submit;
} }
/*
 * Per-waiter state for a task sleeping on the CQ ring. It bundles the wait
 * queue entry with the wake-up criteria so the custom wake function can
 * decide whether the sleeper really needs to run.
 */
struct io_wait_queue {
	/* Embedded entry; io_wake_function() recovers this struct from it. */
	struct wait_queue_entry wq;
	/* Ring context whose completion queue is being waited on. */
	struct io_ring_ctx *ctx;
	/* Number of CQ events that must be available before waking. */
	unsigned int to_wait;
	/* Value of ctx->cq_timeouts sampled when the wait started. */
	unsigned int nr_timeouts;
};
/*
 * Report whether the waiter described by @iowq has a reason to wake up.
 *
 * Returns true when the CQ ring holds at least iowq->to_wait events, or when
 * ctx->cq_timeouts no longer matches the value recorded in
 * iowq->nr_timeouts, i.e. a timeout occurred since the wait began. A timeout
 * always sends the waiter back to userspace, regardless of event count.
 */
static inline bool io_should_wake(struct io_wait_queue *iowq)
{
	struct io_ring_ctx *ctx = iowq->ctx;

	/* Enough completions are already posted. */
	if (io_cqring_events(ctx->rings) >= iowq->to_wait)
		return true;

	/* Otherwise wake only if the timeout counter moved. */
	return atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}
/*
 * Wait queue callback for CQ ring waiters.
 *
 * Hands off to autoremove_wake_function() only once io_should_wake() says
 * the waiter's criteria are met; this avoids waking the task for every
 * single completion when it asked for a batch. Otherwise returns -1 and
 * leaves the entry queued.
 *
 * NOTE(review): a negative return presumably makes the wait-queue walker
 * stop scanning further entries; confirm against __wake_up_common().
 */
static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
			    int wake_flags, void *key)
{
	struct io_wait_queue *iowq;

	iowq = container_of(curr, struct io_wait_queue, wq);
	if (io_should_wake(iowq))
		return autoremove_wake_function(curr, mode, wake_flags, key);

	/* Criteria not met yet: do not wake the sleeper. */
	return -1;
}
/* /*
* Wait until events become available, if we don't already have some. The * Wait until events become available, if we don't already have some. The
* application must reap them itself, as they reside on the shared cq ring. * application must reap them itself, as they reside on the shared cq ring.
...@@ -2775,8 +2807,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, ...@@ -2775,8 +2807,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
const sigset_t __user *sig, size_t sigsz) const sigset_t __user *sig, size_t sigsz)
{ {
struct io_wait_queue iowq = {
.wq = {
.private = current,
.func = io_wake_function,
.entry = LIST_HEAD_INIT(iowq.wq.entry),
},
.ctx = ctx,
.to_wait = min_events,
};
struct io_rings *rings = ctx->rings; struct io_rings *rings = ctx->rings;
unsigned nr_timeouts;
int ret; int ret;
if (io_cqring_events(rings) >= min_events) if (io_cqring_events(rings) >= min_events)
...@@ -2795,15 +2835,21 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ...@@ -2795,15 +2835,21 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
return ret; return ret;
} }
nr_timeouts = atomic_read(&ctx->cq_timeouts); ret = 0;
/* iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts);
* Return if we have enough events, or if a timeout occured since do {
* we started waiting. For timeouts, we always want to return to prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
* userspace. TASK_INTERRUPTIBLE);
*/ if (io_should_wake(&iowq))
ret = wait_event_interruptible(ctx->wait, break;
io_cqring_events(rings) >= min_events || schedule();
atomic_read(&ctx->cq_timeouts) != nr_timeouts); if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
}
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
restore_saved_sigmask_unless(ret == -ERESTARTSYS); restore_saved_sigmask_unless(ret == -ERESTARTSYS);
if (ret == -ERESTARTSYS) if (ret == -ERESTARTSYS)
ret = -EINTR; ret = -EINTR;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment