Commit 96f7e448 authored by Linus Torvalds

Merge tag 'for-6.2/io_uring-next-2022-12-08' of git://git.kernel.dk/linux

Pull io_uring updates part two from Jens Axboe:

 - Misc fixes (me, Lin)

 - Series from Pavel extending the single task exclusive ring mode,
   yielding nice improvements for the common case of having a single
   ring per thread (Pavel); a user-space setup sketch follows this list

 - Cleanup for MSG_RING, removing our IOPOLL hack (Pavel); a MSG_RING
   usage sketch appears just before the diff below

 - Further poll cleanups and fixes (Pavel)

 - Misc cleanups and fixes (Pavel)
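
The "single task exclusive ring mode" referenced above is something an application opts into at ring creation time. The sketch below is illustrative only; it is not part of this commit, and it assumes liburing plus the IORING_SETUP_SINGLE_ISSUER (6.0) and IORING_SETUP_DEFER_TASKRUN (6.1) flags from earlier releases. It shows the kind of ring these patches target, where one task both submits and reaps completions (the exact conditions under which the kernel sets ->task_complete live in the collapsed io_uring.c diff):

/*
 * Illustrative sketch only, not part of this commit. Assumes liburing
 * and a kernel with IORING_SETUP_SINGLE_ISSUER and IORING_SETUP_DEFER_TASKRUN.
 */
#include <liburing.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	/* one submitter task, task_work deferred to that same task */
	ret = io_uring_queue_init(64, &ring,
				  IORING_SETUP_SINGLE_ISSUER |
				  IORING_SETUP_DEFER_TASKRUN);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	/* submit a no-op and reap its completion from the same thread */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	io_uring_sqe_set_data64(sqe, 0x42);
	io_uring_submit(&ring);

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (!ret) {
		printf("nop done, user_data=%llu res=%d\n",
		       (unsigned long long)io_uring_cqe_get_data64(cqe),
		       cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	return 0;
}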

* tag 'for-6.2/io_uring-next-2022-12-08' of git://git.kernel.dk/linux: (22 commits)
  io_uring/msg_ring: flag target ring as having task_work, if needed
  io_uring: skip spinlocking for ->task_complete
  io_uring: do msg_ring in target task via tw
  io_uring: extract a io_msg_install_complete helper
  io_uring: get rid of double locking
  io_uring: never run tw and fallback in parallel
  io_uring: use tw for putting rsrc
  io_uring: force multishot CQEs into task context
  io_uring: complete all requests in task context
  io_uring: don't check overflow flush failures
  io_uring: skip overflow CQE posting for dying ring
  io_uring: improve io_double_lock_ctx fail handling
  io_uring: dont remove file from msg_ring reqs
  io_uring: reshuffle issue_flags
  io_uring: don't reinstall quiesce node for each tw
  io_uring: improve rsrc quiesce refs checks
  io_uring: don't raw spin unlock to match cq_lock
  io_uring: combine poll tw handlers
  io_uring: improve poll warning handling
  io_uring: remove ctx variable in io_poll_check_events
  ...
parents 54e60e50 761c61c1
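
Before the diff itself: the MSG_RING rework below (io_uring/msg_ring.c) changes how the kernel delivers the message CQE when the target ring completes everything in its submitter task, bouncing the posting over task_work instead of taking the target's locks directly. The user-space side is unchanged. As a rough, hedged illustration (assuming liburing 2.2 or newer for io_uring_prep_msg_ring, and a second ring whose file descriptor is target_fd), sending a message CQE to another ring looks like:

/* Illustrative sketch, not part of this commit. Assumes liburing >= 2.2. */
#include <errno.h>
#include <liburing.h>

/*
 * Post a "message" CQE into the ring identified by target_fd: the target
 * ring sees a CQE with cqe->user_data == user_data and cqe->res == len.
 * The CQE reaped here, on the source ring, only reports whether delivery
 * succeeded (res == 0) or failed (for example -EOVERFLOW).
 */
static int send_msg_cqe(struct io_uring *src, int target_fd,
			__u64 user_data, unsigned int len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(src);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;
	io_uring_prep_msg_ring(sqe, target_fd, len, user_data, 0);
	io_uring_submit(src);

	ret = io_uring_wait_cqe(src, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;
	io_uring_cqe_seen(src, cqe);
	return ret;
}

With the changes in this merge, if target_fd refers to a ring created with IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN, the kernel queues the posting as task_work on that ring's submitter task rather than posting under the target's completion lock.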
include/linux/io_uring.h
@@ -9,16 +9,17 @@
 enum io_uring_cmd_flags {
 	IO_URING_F_COMPLETE_DEFER	= 1,
 	IO_URING_F_UNLOCKED		= 2,
+	/* the request is executed from poll, it should not be freed */
+	IO_URING_F_MULTISHOT		= 4,
+	/* executed by io-wq */
+	IO_URING_F_IOWQ			= 8,
 	/* int's last bit, sign checks are usually faster than a bit test */
 	IO_URING_F_NONBLOCK		= INT_MIN,
 
 	/* ctx state flags, for URING_CMD */
-	IO_URING_F_SQE128		= 4,
-	IO_URING_F_CQE32		= 8,
-	IO_URING_F_IOPOLL		= 16,
-
-	/* the request is executed from poll, it should not be freed */
-	IO_URING_F_MULTISHOT		= 32,
+	IO_URING_F_SQE128		= (1 << 8),
+	IO_URING_F_CQE32		= (1 << 9),
+	IO_URING_F_IOPOLL		= (1 << 10),
 };
 
 struct io_uring_cmd {
include/linux/io_uring_types.h
@@ -208,6 +208,8 @@ struct io_ring_ctx {
 		unsigned int		drain_disabled: 1;
 		unsigned int		has_evfd: 1;
 		unsigned int		syscall_iopoll: 1;
+		/* all CQEs should be posted only by the submitter task */
+		unsigned int		task_complete: 1;
 	} ____cacheline_aligned_in_smp;
 
 	/* submission data */
@@ -326,6 +328,7 @@ struct io_ring_ctx {
 	struct io_rsrc_data		*buf_data;
 
 	struct delayed_work		rsrc_put_work;
+	struct callback_head		rsrc_put_tw;
 	struct llist_head		rsrc_put_llist;
 	struct list_head		rsrc_ref_list;
 	spinlock_t			rsrc_ref_lock;
io_uring/io_uring.c: this diff is collapsed in the original view and not reproduced here.
io_uring/io_uring.h
@@ -93,6 +93,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx)
 	spin_lock(&ctx->completion_lock);
 }
 
+static inline void io_cq_unlock(struct io_ring_ctx *ctx)
+{
+	spin_unlock(&ctx->completion_lock);
+}
+
 void io_cq_unlock_post(struct io_ring_ctx *ctx);
 
 static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx,
@@ -128,7 +133,7 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
 	 */
 	cqe = io_get_cqe(ctx);
 	if (unlikely(!cqe))
-		return io_req_cqe_overflow(req);
+		return false;
 
 	trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
 				req->cqe.res, req->cqe.flags,
@@ -151,6 +156,14 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx,
 	return true;
 }
 
+static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
+				   struct io_kiocb *req)
+{
+	if (likely(__io_fill_cqe_req(ctx, req)))
+		return true;
+	return io_req_cqe_overflow(req);
+}
+
 static inline void req_set_fail(struct io_kiocb *req)
 {
 	req->flags |= REQ_F_FAIL;
io_uring/msg_ring.c
@@ -15,6 +15,8 @@
 struct io_msg {
 	struct file			*file;
+	struct file			*src_file;
+	struct callback_head		tw;
 	u64 user_data;
 	u32 len;
 	u32 cmd;
@@ -23,6 +25,34 @@ struct io_msg {
 	u32 flags;
 };
 
+void io_msg_ring_cleanup(struct io_kiocb *req)
+{
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+
+	if (WARN_ON_ONCE(!msg->src_file))
+		return;
+
+	fput(msg->src_file);
+	msg->src_file = NULL;
+}
+
+static void io_msg_tw_complete(struct callback_head *head)
+{
+	struct io_msg *msg = container_of(head, struct io_msg, tw);
+	struct io_kiocb *req = cmd_to_io_kiocb(msg);
+	struct io_ring_ctx *target_ctx = req->file->private_data;
+	int ret = 0;
+
+	if (current->flags & PF_EXITING)
+		ret = -EOWNERDEAD;
+	else if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
+		ret = -EOVERFLOW;
+
+	if (ret < 0)
+		req_set_fail(req);
+	io_req_queue_tw_complete(req, ret);
+}
+
 static int io_msg_ring_data(struct io_kiocb *req)
 {
 	struct io_ring_ctx *target_ctx = req->file->private_data;
@@ -31,23 +61,29 @@ static int io_msg_ring_data(struct io_kiocb *req)
 	if (msg->src_fd || msg->dst_fd || msg->flags)
 		return -EINVAL;
 
+	if (target_ctx->task_complete && current != target_ctx->submitter_task) {
+		init_task_work(&msg->tw, io_msg_tw_complete);
+		if (task_work_add(target_ctx->submitter_task, &msg->tw,
+				  TWA_SIGNAL_NO_IPI))
+			return -EOWNERDEAD;
+
+		atomic_or(IORING_SQ_TASKRUN, &target_ctx->rings->sq_flags);
+		return IOU_ISSUE_SKIP_COMPLETE;
+	}
+
 	if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
 		return 0;
 	return -EOVERFLOW;
 }
 
-static void io_double_unlock_ctx(struct io_ring_ctx *ctx,
-				 struct io_ring_ctx *octx,
+static void io_double_unlock_ctx(struct io_ring_ctx *octx,
 				 unsigned int issue_flags)
 {
-	if (issue_flags & IO_URING_F_UNLOCKED)
-		mutex_unlock(&ctx->uring_lock);
 	mutex_unlock(&octx->uring_lock);
 }
 
-static int io_double_lock_ctx(struct io_ring_ctx *ctx,
-			      struct io_ring_ctx *octx,
+static int io_double_lock_ctx(struct io_ring_ctx *octx,
 			      unsigned int issue_flags)
 {
 	/*
@@ -60,56 +96,49 @@ static int io_double_lock_ctx(struct io_ring_ctx *ctx,
 			return -EAGAIN;
 		return 0;
 	}
-
-	/* Always grab smallest value ctx first. We know ctx != octx. */
-	if (ctx < octx) {
-		mutex_lock(&ctx->uring_lock);
-		mutex_lock(&octx->uring_lock);
-	} else {
-		mutex_lock(&octx->uring_lock);
-		mutex_lock(&ctx->uring_lock);
-	}
-
+	mutex_lock(&octx->uring_lock);
 	return 0;
 }
 
-static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
+static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags)
 {
-	struct io_ring_ctx *target_ctx = req->file->private_data;
 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
 	struct io_ring_ctx *ctx = req->ctx;
+	struct file *file = NULL;
 	unsigned long file_ptr;
-	struct file *src_file;
-	int ret;
-
-	if (target_ctx == ctx)
-		return -EINVAL;
-
-	ret = io_double_lock_ctx(ctx, target_ctx, issue_flags);
-	if (unlikely(ret))
-		return ret;
-
-	ret = -EBADF;
-	if (unlikely(msg->src_fd >= ctx->nr_user_files))
-		goto out_unlock;
-
-	msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files);
-	file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr;
-	if (!file_ptr)
-		goto out_unlock;
-
-	src_file = (struct file *) (file_ptr & FFS_MASK);
-	get_file(src_file);
+	int idx = msg->src_fd;
+
+	io_ring_submit_lock(ctx, issue_flags);
+	if (likely(idx < ctx->nr_user_files)) {
+		idx = array_index_nospec(idx, ctx->nr_user_files);
+		file_ptr = io_fixed_file_slot(&ctx->file_table, idx)->file_ptr;
+		file = (struct file *) (file_ptr & FFS_MASK);
+		if (file)
+			get_file(file);
+	}
+	io_ring_submit_unlock(ctx, issue_flags);
+	return file;
+}
+
+static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_ring_ctx *target_ctx = req->file->private_data;
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+	struct file *src_file = msg->src_file;
+	int ret;
+
+	if (unlikely(io_double_lock_ctx(target_ctx, issue_flags)))
+		return -EAGAIN;
 
 	ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd);
-	if (ret < 0) {
-		fput(src_file);
+	if (ret < 0)
 		goto out_unlock;
-	}
+
+	msg->src_file = NULL;
+	req->flags &= ~REQ_F_NEED_CLEANUP;
 
 	if (msg->flags & IORING_MSG_RING_CQE_SKIP)
 		goto out_unlock;
+
 	/*
 	 * If this fails, the target still received the file descriptor but
 	 * wasn't notified of the fact. This means that if this request
@@ -119,10 +148,51 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
 	if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0))
 		ret = -EOVERFLOW;
 out_unlock:
-	io_double_unlock_ctx(ctx, target_ctx, issue_flags);
+	io_double_unlock_ctx(target_ctx, issue_flags);
 	return ret;
 }
 
+static void io_msg_tw_fd_complete(struct callback_head *head)
+{
+	struct io_msg *msg = container_of(head, struct io_msg, tw);
+	struct io_kiocb *req = cmd_to_io_kiocb(msg);
+	int ret = -EOWNERDEAD;
+
+	if (!(current->flags & PF_EXITING))
+		ret = io_msg_install_complete(req, IO_URING_F_UNLOCKED);
+	if (ret < 0)
+		req_set_fail(req);
+	io_req_queue_tw_complete(req, ret);
+}
+
+static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_ring_ctx *target_ctx = req->file->private_data;
+	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
+	struct io_ring_ctx *ctx = req->ctx;
+	struct file *src_file = msg->src_file;
+
+	if (target_ctx == ctx)
+		return -EINVAL;
+
+	if (!src_file) {
+		src_file = io_msg_grab_file(req, issue_flags);
+		if (!src_file)
+			return -EBADF;
+		msg->src_file = src_file;
+		req->flags |= REQ_F_NEED_CLEANUP;
+	}
+
+	if (target_ctx->task_complete && current != target_ctx->submitter_task) {
+		init_task_work(&msg->tw, io_msg_tw_fd_complete);
+		if (task_work_add(target_ctx->submitter_task, &msg->tw,
+				  TWA_SIGNAL))
+			return -EOWNERDEAD;
+
+		return IOU_ISSUE_SKIP_COMPLETE;
+	}
+	return io_msg_install_complete(req, issue_flags);
+}
+
 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
@@ -130,6 +200,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (unlikely(sqe->buf_index || sqe->personality))
 		return -EINVAL;
 
+	msg->src_file = NULL;
 	msg->user_data = READ_ONCE(sqe->off);
 	msg->len = READ_ONCE(sqe->len);
 	msg->cmd = READ_ONCE(sqe->addr);
@@ -164,12 +235,11 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
 	}
 
 done:
-	if (ret < 0)
+	if (ret < 0) {
+		if (ret == -EAGAIN || ret == IOU_ISSUE_SKIP_COMPLETE)
+			return ret;
 		req_set_fail(req);
+	}
 	io_req_set_res(req, ret, 0);
-	/* put file to avoid an attempt to IOPOLL the req */
-	if (!(req->flags & REQ_F_FIXED_FILE))
-		io_put_file(req->file);
-	req->file = NULL;
 	return IOU_OK;
 }
io_uring/msg_ring.h
@@ -2,3 +2,4 @@
 
 int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
 int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
+void io_msg_ring_cleanup(struct io_kiocb *req);
io_uring/net.c
@@ -67,6 +67,19 @@ struct io_sr_msg {
 	struct io_kiocb		*notif;
 };
 
+static inline bool io_check_multishot(struct io_kiocb *req,
+				      unsigned int issue_flags)
+{
+	/*
+	 * When ->locked_cq is set we only allow to post CQEs from the original
+	 * task context. Usual request completions will be handled in other
+	 * generic paths but multipoll may decide to post extra cqes.
+	 */
+	return !(issue_flags & IO_URING_F_IOWQ) ||
+		!(issue_flags & IO_URING_F_MULTISHOT) ||
+		!req->ctx->task_complete;
+}
+
 int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown);
@@ -730,6 +743,9 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
 		return io_setup_async_msg(req, kmsg, issue_flags);
 
+	if (!io_check_multishot(req, issue_flags))
+		return io_setup_async_msg(req, kmsg, issue_flags);
+
 retry_multishot:
 	if (io_do_buffer_select(req)) {
 		void __user *buf;
@@ -829,6 +845,9 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
 		return -EAGAIN;
 
+	if (!io_check_multishot(req, issue_flags))
+		return -EAGAIN;
+
 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
 		return -ENOTSOCK;
@@ -1280,6 +1299,8 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags)
 	struct file *file;
 	int ret, fd;
 
+	if (!io_check_multishot(req, issue_flags))
+		return -EAGAIN;
 retry:
 	if (!fixed) {
 		fd = __get_unused_fd_flags(accept->flags, accept->nofile);
io_uring/opdef.c
@@ -63,6 +63,7 @@ const struct io_op_def io_op_defs[] = {
 		.audit_skip		= 1,
 		.ioprio			= 1,
 		.iopoll			= 1,
+		.iopoll_queue		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.name			= "READV",
 		.prep			= io_prep_rw,
@@ -80,6 +81,7 @@ const struct io_op_def io_op_defs[] = {
 		.audit_skip		= 1,
 		.ioprio			= 1,
 		.iopoll			= 1,
+		.iopoll_queue		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.name			= "WRITEV",
 		.prep			= io_prep_rw,
@@ -103,6 +105,7 @@ const struct io_op_def io_op_defs[] = {
 		.audit_skip		= 1,
 		.ioprio			= 1,
 		.iopoll			= 1,
+		.iopoll_queue		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.name			= "READ_FIXED",
 		.prep			= io_prep_rw,
@@ -118,6 +121,7 @@ const struct io_op_def io_op_defs[] = {
 		.audit_skip		= 1,
 		.ioprio			= 1,
 		.iopoll			= 1,
+		.iopoll_queue		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.name			= "WRITE_FIXED",
 		.prep			= io_prep_rw,
@@ -277,6 +281,7 @@ const struct io_op_def io_op_defs[] = {
 		.audit_skip		= 1,
 		.ioprio			= 1,
 		.iopoll			= 1,
+		.iopoll_queue		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.name			= "READ",
 		.prep			= io_prep_rw,
@@ -292,6 +297,7 @@ const struct io_op_def io_op_defs[] = {
 		.audit_skip		= 1,
 		.ioprio			= 1,
 		.iopoll			= 1,
+		.iopoll_queue		= 1,
 		.async_size		= sizeof(struct io_async_rw),
 		.name			= "WRITE",
 		.prep			= io_prep_rw,
@@ -439,6 +445,7 @@ const struct io_op_def io_op_defs[] = {
 		.name			= "MSG_RING",
 		.prep			= io_msg_ring_prep,
 		.issue			= io_msg_ring,
+		.cleanup		= io_msg_ring_cleanup,
 	},
 	[IORING_OP_FSETXATTR] = {
 		.needs_file = 1,
@@ -481,6 +488,7 @@ const struct io_op_def io_op_defs[] = {
 		.plug			= 1,
 		.name			= "URING_CMD",
 		.iopoll			= 1,
+		.iopoll_queue		= 1,
 		.async_size		= uring_cmd_pdu_size(1),
 		.prep			= io_uring_cmd_prep,
 		.issue			= io_uring_cmd,
io_uring/opdef.h
@@ -25,6 +25,8 @@ struct io_op_def {
 	unsigned		ioprio : 1;
 	/* supports iopoll */
 	unsigned		iopoll : 1;
+	/* have to be put into the iopoll list */
+	unsigned		iopoll_queue : 1;
 	/* opcode specific path will handle ->async_data allocation if needed */
 	unsigned		manual_alloc : 1;
 	/* size of async data needed, if any */
io_uring/poll.c
@@ -237,7 +237,6 @@ enum {
  */
 static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 {
-	struct io_ring_ctx *ctx = req->ctx;
 	int v, ret;
 
 	/* req->task == current here, checking PF_EXITING is safe */
@@ -247,27 +246,30 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 	do {
 		v = atomic_read(&req->poll_refs);
 
-		/* tw handler should be the owner, and so have some references */
-		if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
-			return IOU_POLL_DONE;
-		if (v & IO_POLL_CANCEL_FLAG)
-			return -ECANCELED;
-		/*
-		 * cqe.res contains only events of the first wake up
-		 * and all others are be lost. Redo vfs_poll() to get
-		 * up to date state.
-		 */
-		if ((v & IO_POLL_REF_MASK) != 1)
-			req->cqe.res = 0;
-		if (v & IO_POLL_RETRY_FLAG) {
-			req->cqe.res = 0;
-			/*
-			 * We won't find new events that came in between
-			 * vfs_poll and the ref put unless we clear the flag
-			 * in advance.
-			 */
-			atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
-			v &= ~IO_POLL_RETRY_FLAG;
+		if (unlikely(v != 1)) {
+			/* tw should be the owner and so have some refs */
+			if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
+				return IOU_POLL_NO_ACTION;
+			if (v & IO_POLL_CANCEL_FLAG)
+				return -ECANCELED;
+			/*
+			 * cqe.res contains only events of the first wake up
+			 * and all others are to be lost. Redo vfs_poll() to get
+			 * up to date state.
+			 */
+			if ((v & IO_POLL_REF_MASK) != 1)
+				req->cqe.res = 0;
+
+			if (v & IO_POLL_RETRY_FLAG) {
+				req->cqe.res = 0;
+				/*
+				 * We won't find new events that came in between
+				 * vfs_poll and the ref put unless we clear the
+				 * flag in advance.
+				 */
+				atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
+				v &= ~IO_POLL_RETRY_FLAG;
+			}
 		}
 
 		/* the mask was stashed in __io_poll_execute */
@@ -286,7 +288,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 			__poll_t mask = mangle_poll(req->cqe.res &
 						    req->apoll_events);
 
-			if (!io_aux_cqe(ctx, *locked, req->cqe.user_data,
+			if (!io_aux_cqe(req->ctx, *locked, req->cqe.user_data,
 					mask, IORING_CQE_F_MORE, false)) {
 				io_req_set_res(req, mask, 0);
 				return IOU_POLL_REMOVE_POLL_USE_RES;
@@ -319,50 +321,38 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 	ret = io_poll_check_events(req, locked);
 	if (ret == IOU_POLL_NO_ACTION)
 		return;
 
-	if (ret == IOU_POLL_DONE) {
-		struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
-
-		req->cqe.res = mangle_poll(req->cqe.res & poll->events);
-	} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
-		req->cqe.res = ret;
-		req_set_fail(req);
-	}
-
 	io_poll_remove_entries(req);
 	io_poll_tw_hash_eject(req, locked);
 
-	io_req_set_res(req, req->cqe.res, 0);
-	io_req_task_complete(req, locked);
-}
-
-static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
-{
-	int ret;
-
-	ret = io_poll_check_events(req, locked);
-	if (ret == IOU_POLL_NO_ACTION)
-		return;
-
-	io_tw_lock(req->ctx, locked);
-	io_poll_remove_entries(req);
-	io_poll_tw_hash_eject(req, locked);
+	if (req->opcode == IORING_OP_POLL_ADD) {
+		if (ret == IOU_POLL_DONE) {
+			struct io_poll *poll;
 
-	if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
-		io_req_task_complete(req, locked);
-	else if (ret == IOU_POLL_DONE)
-		io_req_task_submit(req, locked);
-	else
-		io_req_defer_failed(req, ret);
+			poll = io_kiocb_to_cmd(req, struct io_poll);
+			req->cqe.res = mangle_poll(req->cqe.res & poll->events);
+		} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
+			req->cqe.res = ret;
+			req_set_fail(req);
+		}
+
+		io_req_set_res(req, req->cqe.res, 0);
+		io_req_task_complete(req, locked);
+	} else {
+		io_tw_lock(req->ctx, locked);
+
+		if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
+			io_req_task_complete(req, locked);
+		else if (ret == IOU_POLL_DONE)
+			io_req_task_submit(req, locked);
+		else
+			io_req_defer_failed(req, ret);
+	}
 }
 
 static void __io_poll_execute(struct io_kiocb *req, int mask)
 {
 	io_req_set_res(req, mask, 0);
-	if (req->opcode == IORING_OP_POLL_ADD)
-		req->io_task_work.func = io_poll_task_func;
-	else
-		req->io_task_work.func = io_apoll_task_func;
+	req->io_task_work.func = io_poll_task_func;
 
 	trace_io_uring_task_add(req, mask);
 	io_req_task_work_add(req);
io_uring/rsrc.c
@@ -204,6 +204,14 @@ void io_rsrc_put_work(struct work_struct *work)
 	}
 }
 
+void io_rsrc_put_tw(struct callback_head *cb)
+{
+	struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx,
+					       rsrc_put_tw);
+
+	io_rsrc_put_work(&ctx->rsrc_put_work.work);
+}
+
 void io_wait_rsrc_data(struct io_rsrc_data *data)
 {
 	if (data && !atomic_dec_and_test(&data->refs))
@@ -242,8 +250,15 @@ static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref)
 	}
 	spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags);
 
-	if (first_add)
-		mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
+	if (!first_add)
+		return;
+
+	if (ctx->submitter_task) {
+		if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw,
+				   ctx->notify_method))
+			return;
+	}
+	mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay);
 }
 
 static struct io_rsrc_node *io_rsrc_node_alloc(void)
@@ -309,46 +324,41 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
 	/* As we may drop ->uring_lock, other task may have started quiesce */
 	if (data->quiesce)
 		return -ENXIO;
+	ret = io_rsrc_node_switch_start(ctx);
+	if (ret)
+		return ret;
+	io_rsrc_node_switch(ctx, data);
+
+	/* kill initial ref, already quiesced if zero */
+	if (atomic_dec_and_test(&data->refs))
+		return 0;
 
 	data->quiesce = true;
+	mutex_unlock(&ctx->uring_lock);
 	do {
-		ret = io_rsrc_node_switch_start(ctx);
-		if (ret)
-			break;
-		io_rsrc_node_switch(ctx, data);
-
-		/* kill initial ref, already quiesced if zero */
-		if (atomic_dec_and_test(&data->refs))
-			break;
-		mutex_unlock(&ctx->uring_lock);
-
 		ret = io_run_task_work_sig(ctx);
-		if (ret < 0)
-			goto reinit;
+		if (ret < 0) {
+			atomic_inc(&data->refs);
+			/* wait for all works potentially completing data->done */
+			flush_delayed_work(&ctx->rsrc_put_work);
+			reinit_completion(&data->done);
+			mutex_lock(&ctx->uring_lock);
+			break;
+		}
 
 		flush_delayed_work(&ctx->rsrc_put_work);
 		ret = wait_for_completion_interruptible(&data->done);
 		if (!ret) {
 			mutex_lock(&ctx->uring_lock);
-			if (atomic_read(&data->refs) > 0) {
-				/*
-				 * it has been revived by another thread while
-				 * we were unlocked
-				 */
-				mutex_unlock(&ctx->uring_lock);
-			} else {
+			if (atomic_read(&data->refs) <= 0)
 				break;
-			}
+			/*
+			 * it has been revived by another thread while
+			 * we were unlocked
+			 */
+			mutex_unlock(&ctx->uring_lock);
 		}
+	} while (1);
 
-reinit:
-		atomic_inc(&data->refs);
-		/* wait for all works potentially completing data->done */
-		flush_delayed_work(&ctx->rsrc_put_work);
-		reinit_completion(&data->done);
-		mutex_lock(&ctx->uring_lock);
-	} while (ret >= 0);
 	data->quiesce = false;
 
 	return ret;
io_uring/rsrc.h
@@ -53,6 +53,7 @@ struct io_mapped_ubuf {
 	struct bio_vec	bvec[];
 };
 
+void io_rsrc_put_tw(struct callback_head *cb);
 void io_rsrc_put_work(struct work_struct *work);
 void io_rsrc_refs_refill(struct io_ring_ctx *ctx);
 void io_wait_rsrc_data(struct io_rsrc_data *data);