Commit cec53f4c authored by Linus Torvalds

Merge tag 'io_uring-6.0-2022-09-02' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:

 - A single fix for over-eager retries for networking (Pavel)

 - Revert the notification slot support for zerocopy sends.

   It turns out that even after more than a year of development and
   testing, there's not full agreement on whether just using plain
   ordered notifications is Good Enough to avoid the complexity of using
   the notification slots. Because of that, we decided that it's best
   left to a future final decision.

   We can always bring back this feature, but we can't really change it
   or remove it once we've released 6.0 with it enabled. The reverts
   leave the usual CQE notifications as the primary interface for
   knowing when data was sent, and when it was acked. (Pavel)
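
   For illustration only (not part of this merge), a minimal sketch of
   the resulting user-visible flow, assuming liburing's setup and
   submission helpers and mirroring the selftest's prep helper; ZC_TAG
   is a hypothetical user_data value and error handling is omitted:

     /* Submit one zerocopy send: no slot registration step, just a
      * plain send SQE with the opcode switched to IORING_OP_SEND_ZC. */
     struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);

     io_uring_prep_send(sqe, sockfd, buf, len, 0);
     sqe->opcode = IORING_OP_SEND_ZC;
     sqe->user_data = ZC_TAG;
     io_uring_submit(&ring);

     /* A zerocopy send can now post two CQEs with the same user_data:
      * first the send result (with IORING_CQE_F_MORE set when a
      * notification will follow), later a CQE flagged IORING_CQE_F_NOTIF
      * once the kernel has released the buffer, i.e. the data was acked. */
     struct io_uring_cqe *cqe;

     io_uring_wait_cqe(&ring, &cqe);
     if (cqe->flags & IORING_CQE_F_NOTIF) {
             /* notification: the buffer may be reused */
     } else if (cqe->flags & IORING_CQE_F_MORE) {
             /* send result in cqe->res; a notification CQE follows */
     }
     io_uring_cqe_seen(&ring, cqe);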

* tag 'io_uring-6.0-2022-09-02' of git://git.kernel.dk/linux-block:
  selftests/net: return back io_uring zc send tests
  io_uring/net: simplify zerocopy send user API
  io_uring/notif: remove notif registration
  Revert "io_uring: rename IORING_OP_FILES_UPDATE"
  Revert "io_uring: add zc notification flush requests"
  selftests/net: temporarily disable io_uring zc test
  io_uring/net: fix overexcessive retries
parents 1551f8f2 916d72c1
@@ -71,8 +71,8 @@ struct io_uring_sqe {
 		__s32	splice_fd_in;
 		__u32	file_index;
 		struct {
-			__u16	notification_idx;
 			__u16	addr_len;
+			__u16	__pad3[1];
 		};
 	};
 	union {
@@ -178,8 +178,7 @@ enum io_uring_op {
 	IORING_OP_FALLOCATE,
 	IORING_OP_OPENAT,
 	IORING_OP_CLOSE,
-	IORING_OP_RSRC_UPDATE,
-	IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE,
+	IORING_OP_FILES_UPDATE,
 	IORING_OP_STATX,
 	IORING_OP_READ,
 	IORING_OP_WRITE,
@@ -206,7 +205,7 @@ enum io_uring_op {
 	IORING_OP_GETXATTR,
 	IORING_OP_SOCKET,
 	IORING_OP_URING_CMD,
-	IORING_OP_SENDZC_NOTIF,
+	IORING_OP_SEND_ZC,

 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -228,7 +227,6 @@ enum io_uring_op {
 #define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
 #define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
 #define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
-
 /*
  * sqe->splice_flags
  * extends splice(2) flags
@@ -281,29 +279,16 @@ enum io_uring_op {
  *
  * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
  *				the buf_index field.
- *
- * IORING_RECVSEND_NOTIF_FLUSH	Flush a notification after a successful
- *				successful. Only for zerocopy sends.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 #define IORING_RECV_MULTISHOT		(1U << 1)
 #define IORING_RECVSEND_FIXED_BUF	(1U << 2)
-#define IORING_RECVSEND_NOTIF_FLUSH	(1U << 3)

 /*
  * accept flags stored in sqe->ioprio
  */
 #define IORING_ACCEPT_MULTISHOT	(1U << 0)

-/*
- * IORING_OP_RSRC_UPDATE flags
- */
-enum {
-	IORING_RSRC_UPDATE_FILES,
-	IORING_RSRC_UPDATE_NOTIF,
-};
-
 /*
  * IORING_OP_MSG_RING command types, stored in sqe->addr
  */
@@ -341,10 +326,13 @@ struct io_uring_cqe {
  * IORING_CQE_F_BUFFER	If set, the upper 16 bits are the buffer ID
  * IORING_CQE_F_MORE	If set, parent SQE will generate more CQE entries
  * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
+ * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinct
+ *			them from sends.
  */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
 #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
+#define IORING_CQE_F_NOTIF		(1U << 3)

 enum {
 	IORING_CQE_BUFFER_SHIFT		= 16,
@@ -485,10 +473,6 @@ enum {
 	/* register a range of fixed file slots for automatic slot allocation */
 	IORING_REGISTER_FILE_ALLOC_RANGE	= 25,

-	/* zerocopy notification API */
-	IORING_REGISTER_NOTIFIERS		= 26,
-	IORING_UNREGISTER_NOTIFIERS		= 27,
-
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
...
@@ -2640,7 +2640,6 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 		io_unregister_personality(ctx, index);
 	if (ctx->rings)
 		io_poll_remove_all(ctx, NULL, true);
-	io_notif_unregister(ctx);
 	mutex_unlock(&ctx->uring_lock);

 	/* failed during ring init, it couldn't have issued any requests */
@@ -3839,15 +3838,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_file_alloc_range(ctx, arg);
 		break;
-	case IORING_REGISTER_NOTIFIERS:
-		ret = io_notif_register(ctx, arg, nr_args);
-		break;
-	case IORING_UNREGISTER_NOTIFIERS:
-		ret = -EINVAL;
-		if (arg || nr_args)
-			break;
-		ret = io_notif_unregister(ctx);
-		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -3933,8 +3923,8 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(42, __u16, personality);
 	BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
 	BUILD_BUG_SQE_ELEM(44, __u32, file_index);
-	BUILD_BUG_SQE_ELEM(44, __u16, notification_idx);
-	BUILD_BUG_SQE_ELEM(46, __u16, addr_len);
+	BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
+	BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
 	BUILD_BUG_SQE_ELEM(48, __u64, addr3);
 	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
 	BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
...
@@ -65,12 +65,12 @@ struct io_sendzc {
 	struct file *file;
 	void __user *buf;
 	size_t len;
-	u16 slot_idx;
 	unsigned msg_flags;
 	unsigned flags;
 	unsigned addr_len;
 	void __user *addr;
 	size_t done_io;
+	struct io_kiocb *notif;
 };

 #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
@@ -879,17 +879,31 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags)
 	return ret;
 }

+void io_sendzc_cleanup(struct io_kiocb *req)
+{
+	struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
+
+	zc->notif->flags |= REQ_F_CQE_SKIP;
+	io_notif_flush(zc->notif);
+	zc->notif = NULL;
+}
+
 int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *notif;

-	if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))
+	if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3) ||
+	    READ_ONCE(sqe->__pad3[0]))
+		return -EINVAL;
+	/* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */
+	if (req->flags & REQ_F_CQE_SKIP)
 		return -EINVAL;

 	zc->flags = READ_ONCE(sqe->ioprio);
 	if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
-			  IORING_RECVSEND_FIXED_BUF | IORING_RECVSEND_NOTIF_FLUSH))
+			  IORING_RECVSEND_FIXED_BUF))
 		return -EINVAL;
 	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
 		unsigned idx = READ_ONCE(sqe->buf_index);
@@ -900,11 +914,17 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		req->imu = READ_ONCE(ctx->user_bufs[idx]);
 		io_req_set_rsrc_node(req, ctx, 0);
 	}

+	notif = zc->notif = io_alloc_notif(ctx);
+	if (!notif)
+		return -ENOMEM;
+	notif->cqe.user_data = req->cqe.user_data;
+	notif->cqe.res = 0;
+	notif->cqe.flags = IORING_CQE_F_NOTIF;
+	req->flags |= REQ_F_NEED_CLEANUP;
+
 	zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	zc->len = READ_ONCE(sqe->len);
 	zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
-	zc->slot_idx = READ_ONCE(sqe->notification_idx);
 	if (zc->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
@@ -956,7 +976,7 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 	shinfo->nr_frags = frag;
 	from->bvec += bi.bi_idx;
 	from->nr_segs -= bi.bi_idx;
-	from->count = bi.bi_size;
+	from->count -= copied;
 	from->iov_offset = bi.bi_bvec_done;

 	skb->data_len += copied;
@@ -976,33 +996,20 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct sockaddr_storage __address, *addr = NULL;
-	struct io_ring_ctx *ctx = req->ctx;
 	struct io_sendzc *zc = io_kiocb_to_cmd(req, struct io_sendzc);
-	struct io_notif_slot *notif_slot;
-	struct io_kiocb *notif;
 	struct msghdr msg;
 	struct iovec iov;
 	struct socket *sock;
-	unsigned msg_flags;
+	unsigned msg_flags, cflags;
 	int ret, min_ret = 0;

 	if (!(req->flags & REQ_F_POLLED) &&
 	    (zc->flags & IORING_RECVSEND_POLL_FIRST))
 		return -EAGAIN;
-	if (issue_flags & IO_URING_F_UNLOCKED)
-		return -EAGAIN;

 	sock = sock_from_file(req->file);
 	if (unlikely(!sock))
 		return -ENOTSOCK;

-	notif_slot = io_get_notif_slot(ctx, zc->slot_idx);
-	if (!notif_slot)
-		return -EINVAL;
-	notif = io_get_notif(ctx, notif_slot);
-	if (!notif)
-		return -ENOMEM;
-
 	msg.msg_name = NULL;
 	msg.msg_control = NULL;
 	msg.msg_controllen = 0;
@@ -1033,7 +1040,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 					  &msg.msg_iter);
 		if (unlikely(ret))
 			return ret;
-		ret = io_notif_account_mem(notif, zc->len);
+		ret = io_notif_account_mem(zc->notif, zc->len);
 		if (unlikely(ret))
 			return ret;
 	}
@@ -1045,7 +1052,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 		min_ret = iov_iter_count(&msg.msg_iter);

 	msg.msg_flags = msg_flags;
-	msg.msg_ubuf = &io_notif_to_data(notif)->uarg;
+	msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
 	msg.sg_from_iter = io_sg_from_iter;
 	ret = sock_sendmsg(sock, &msg);
@@ -1060,18 +1067,22 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 			req->flags |= REQ_F_PARTIAL_IO;
 			return io_setup_async_addr(req, addr, issue_flags);
 		}
+		if (ret < 0 && !zc->done_io)
+			zc->notif->flags |= REQ_F_CQE_SKIP;
 		if (ret == -ERESTARTSYS)
 			ret = -EINTR;
 		req_set_fail(req);
-	} else if (zc->flags & IORING_RECVSEND_NOTIF_FLUSH) {
-		io_notif_slot_flush_submit(notif_slot, 0);
 	}

 	if (ret >= 0)
 		ret += zc->done_io;
 	else if (zc->done_io)
 		ret = zc->done_io;
-	io_req_set_res(req, ret, 0);
+
+	io_notif_flush(zc->notif);
+	req->flags &= ~REQ_F_NEED_CLEANUP;
+	cflags = ret >= 0 ? IORING_CQE_F_MORE : 0;
+	io_req_set_res(req, ret, cflags);
 	return IOU_OK;
 }
...
@@ -55,6 +55,7 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags);

 int io_sendzc(struct io_kiocb *req, unsigned int issue_flags);
 int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+void io_sendzc_cleanup(struct io_kiocb *req);

 void io_netmsg_cache_free(struct io_cache_entry *entry);
 #else
...
@@ -42,8 +42,7 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
 	}
 }

-struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
-				struct io_notif_slot *slot)
+struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
 	__must_hold(&ctx->uring_lock)
 {
 	struct io_kiocb *notif;
@@ -59,101 +58,23 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
 	io_get_task_refs(1);
 	notif->rsrc_node = NULL;
 	io_req_set_rsrc_node(notif, ctx, 0);
-	notif->cqe.user_data = slot->tag;
-	notif->cqe.flags = slot->seq++;
-	notif->cqe.res = 0;

 	nd = io_notif_to_data(notif);
 	nd->account_pages = 0;
 	nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
 	nd->uarg.callback = io_uring_tx_zerocopy_callback;
-	/* master ref owned by io_notif_slot, will be dropped on flush */
 	refcount_set(&nd->uarg.refcnt, 1);
 	return notif;
 }

-void io_notif_slot_flush(struct io_notif_slot *slot)
+void io_notif_flush(struct io_kiocb *notif)
 	__must_hold(&slot->notif->ctx->uring_lock)
 {
-	struct io_kiocb *notif = slot->notif;
 	struct io_notif_data *nd = io_notif_to_data(notif);

-	slot->notif = NULL;
-
 	/* drop slot's master ref */
 	if (refcount_dec_and_test(&nd->uarg.refcnt)) {
 		notif->io_task_work.func = __io_notif_complete_tw;
 		io_req_task_work_add(notif);
 	}
 }
-
-__cold int io_notif_unregister(struct io_ring_ctx *ctx)
-	__must_hold(&ctx->uring_lock)
-{
-	int i;
-
-	if (!ctx->notif_slots)
-		return -ENXIO;
-
-	for (i = 0; i < ctx->nr_notif_slots; i++) {
-		struct io_notif_slot *slot = &ctx->notif_slots[i];
-		struct io_kiocb *notif = slot->notif;
-		struct io_notif_data *nd;
-
-		if (!notif)
-			continue;
-		nd = io_notif_to_data(notif);
-		slot->notif = NULL;
-		if (!refcount_dec_and_test(&nd->uarg.refcnt))
-			continue;
-		notif->io_task_work.func = __io_notif_complete_tw;
-		io_req_task_work_add(notif);
-	}
-
-	kvfree(ctx->notif_slots);
-	ctx->notif_slots = NULL;
-	ctx->nr_notif_slots = 0;
-	return 0;
-}
-
-__cold int io_notif_register(struct io_ring_ctx *ctx,
-			     void __user *arg, unsigned int size)
-	__must_hold(&ctx->uring_lock)
-{
-	struct io_uring_notification_slot __user *slots;
-	struct io_uring_notification_slot slot;
-	struct io_uring_notification_register reg;
-	unsigned i;
-
-	if (ctx->nr_notif_slots)
-		return -EBUSY;
-	if (size != sizeof(reg))
-		return -EINVAL;
-	if (copy_from_user(&reg, arg, sizeof(reg)))
-		return -EFAULT;
-	if (!reg.nr_slots || reg.nr_slots > IORING_MAX_NOTIF_SLOTS)
-		return -EINVAL;
-	if (reg.resv || reg.resv2 || reg.resv3)
-		return -EINVAL;
-
-	slots = u64_to_user_ptr(reg.data);
-	ctx->notif_slots = kvcalloc(reg.nr_slots, sizeof(ctx->notif_slots[0]),
-				    GFP_KERNEL_ACCOUNT);
-	if (!ctx->notif_slots)
-		return -ENOMEM;
-
-	for (i = 0; i < reg.nr_slots; i++, ctx->nr_notif_slots++) {
-		struct io_notif_slot *notif_slot = &ctx->notif_slots[i];
-
-		if (copy_from_user(&slot, &slots[i], sizeof(slot))) {
-			io_notif_unregister(ctx);
-			return -EFAULT;
-		}
-		if (slot.resv[0] | slot.resv[1] | slot.resv[2]) {
-			io_notif_unregister(ctx);
-			return -EINVAL;
-		}
-		notif_slot->tag = slot.tag;
-	}
-	return 0;
-}
@@ -8,7 +8,6 @@
 #include "rsrc.h"

 #define IO_NOTIF_SPLICE_BATCH	32
-#define IORING_MAX_NOTIF_SLOTS	(1U << 15)

 struct io_notif_data {
 	struct file		*file;
@@ -16,63 +15,14 @@ struct io_notif_data {
 	unsigned long		account_pages;
 };

-struct io_notif_slot {
-	/*
-	 * Current/active notifier. A slot holds only one active notifier at a
-	 * time and keeps one reference to it. Flush releases the reference and
-	 * lazily replaces it with a new notifier.
-	 */
-	struct io_kiocb		*notif;
-
-	/*
-	 * Default ->user_data for this slot notifiers CQEs
-	 */
-	u64			tag;
-	/*
-	 * Notifiers of a slot live in generations, we create a new notifier
-	 * only after flushing the previous one. Track the sequential number
-	 * for all notifiers and copy it into notifiers's cqe->cflags
-	 */
-	u32			seq;
-};
-
-int io_notif_register(struct io_ring_ctx *ctx,
-		      void __user *arg, unsigned int size);
-int io_notif_unregister(struct io_ring_ctx *ctx);
-
-void io_notif_slot_flush(struct io_notif_slot *slot);
-struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
-				struct io_notif_slot *slot);
+void io_notif_flush(struct io_kiocb *notif);
+struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx);

 static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif)
 {
 	return io_kiocb_to_cmd(notif, struct io_notif_data);
 }

-static inline struct io_kiocb *io_get_notif(struct io_ring_ctx *ctx,
-					    struct io_notif_slot *slot)
-{
-	if (!slot->notif)
-		slot->notif = io_alloc_notif(ctx, slot);
-	return slot->notif;
-}
-
-static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx,
-						      unsigned idx)
-	__must_hold(&ctx->uring_lock)
-{
-	if (idx >= ctx->nr_notif_slots)
-		return NULL;
-	idx = array_index_nospec(idx, ctx->nr_notif_slots);
-	return &ctx->notif_slots[idx];
-}
-
-static inline void io_notif_slot_flush_submit(struct io_notif_slot *slot,
-					      unsigned int issue_flags)
-{
-	io_notif_slot_flush(slot);
-}
-
 static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)
 {
 	struct io_ring_ctx *ctx = notif->ctx;
...
@@ -246,13 +246,12 @@ const struct io_op_def io_op_defs[] = {
 		.prep			= io_close_prep,
 		.issue			= io_close,
 	},
-	[IORING_OP_RSRC_UPDATE] = {
+	[IORING_OP_FILES_UPDATE] = {
 		.audit_skip		= 1,
 		.iopoll			= 1,
-		.name			= "RSRC_UPDATE",
-		.prep			= io_rsrc_update_prep,
-		.issue			= io_rsrc_update,
-		.ioprio			= 1,
+		.name			= "FILES_UPDATE",
+		.prep			= io_files_update_prep,
+		.issue			= io_files_update,
 	},
 	[IORING_OP_STATX] = {
 		.audit_skip		= 1,
@@ -471,7 +470,7 @@ const struct io_op_def io_op_defs[] = {
 		.issue			= io_uring_cmd,
 		.prep_async		= io_uring_cmd_prep_async,
 	},
-	[IORING_OP_SENDZC_NOTIF] = {
+	[IORING_OP_SEND_ZC] = {
 		.name			= "SENDZC_NOTIF",
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
@@ -484,6 +483,7 @@ const struct io_op_def io_op_defs[] = {
 		.prep			= io_sendzc_prep,
 		.issue			= io_sendzc,
 		.prep_async		= io_sendzc_prep_async,
+		.cleanup		= io_sendzc_cleanup,
 #else
 		.prep			= io_eopnotsupp_prep,
 #endif
...
@@ -15,14 +15,12 @@
 #include "io_uring.h"
 #include "openclose.h"
 #include "rsrc.h"
-#include "notif.h"

 struct io_rsrc_update {
 	struct file			*file;
 	u64				arg;
 	u32				nr_args;
 	u32				offset;
-	int				type;
 };

 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
@@ -655,7 +653,7 @@ __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 	return -EINVAL;
 }

-int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
@@ -669,7 +667,6 @@ int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (!up->nr_args)
 		return -EINVAL;
 	up->arg = READ_ONCE(sqe->addr);
-	up->type = READ_ONCE(sqe->ioprio);
 	return 0;
 }
@@ -712,7 +709,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
 	return ret;
 }

-static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
+int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
 	struct io_ring_ctx *ctx = req->ctx;
@@ -741,54 +738,6 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 	return IOU_OK;
 }

-static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags)
-{
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
-	struct io_ring_ctx *ctx = req->ctx;
-	unsigned len = up->nr_args;
-	unsigned idx_end, idx = up->offset;
-	int ret = 0;
-
-	io_ring_submit_lock(ctx, issue_flags);
-	if (unlikely(check_add_overflow(idx, len, &idx_end))) {
-		ret = -EOVERFLOW;
-		goto out;
-	}
-	if (unlikely(idx_end > ctx->nr_notif_slots)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	for (; idx < idx_end; idx++) {
-		struct io_notif_slot *slot = &ctx->notif_slots[idx];
-
-		if (!slot->notif)
-			continue;
-		if (up->arg)
-			slot->tag = up->arg;
-		io_notif_slot_flush_submit(slot, issue_flags);
-	}
-out:
-	io_ring_submit_unlock(ctx, issue_flags);
-	if (ret < 0)
-		req_set_fail(req);
-	io_req_set_res(req, ret, 0);
-	return IOU_OK;
-}
-
-int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags)
-{
-	struct io_rsrc_update *up = io_kiocb_to_cmd(req, struct io_rsrc_update);
-
-	switch (up->type) {
-	case IORING_RSRC_UPDATE_FILES:
-		return io_files_update(req, issue_flags);
-	case IORING_RSRC_UPDATE_NOTIF:
-		return io_notif_update(req, issue_flags);
-	}
-	return -EINVAL;
-}
-
 int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 			  struct io_rsrc_node *node, void *rsrc)
 {
...
@@ -167,8 +167,8 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
 	return &data->tags[table_idx][off];
 }

-int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags);
-int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
+int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
+int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);

 int __io_account_mem(struct user_struct *user, unsigned long nr_pages);
...
@@ -47,7 +47,6 @@ enum {
 	MODE_MIXED		= 3,
 };

-static bool cfg_flush		= false;
 static bool cfg_cork		= false;
 static int  cfg_mode		= MODE_ZC_FIXED;
 static int  cfg_nr_reqs	= 8;
@@ -166,21 +165,6 @@ static int io_uring_register_buffers(struct io_uring *ring,
 	return (ret < 0) ? -errno : ret;
 }

-static int io_uring_register_notifications(struct io_uring *ring,
-					   unsigned nr,
-					   struct io_uring_notification_slot *slots)
-{
-	int ret;
-	struct io_uring_notification_register r = {
-		.nr_slots = nr,
-		.data = (unsigned long)slots,
-	};
-
-	ret = syscall(__NR_io_uring_register, ring->ring_fd,
-		      IORING_REGISTER_NOTIFIERS, &r, sizeof(r));
-	return (ret < 0) ? -errno : ret;
-}
-
 static int io_uring_mmap(int fd, struct io_uring_params *p,
 			 struct io_uring_sq *sq, struct io_uring_cq *cq)
 {
@@ -297,11 +281,10 @@ static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,

 static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
 					const void *buf, size_t len, int flags,
-					unsigned slot_idx, unsigned zc_flags)
+					unsigned zc_flags)
 {
 	io_uring_prep_send(sqe, sockfd, buf, len, flags);
-	sqe->opcode = (__u8) IORING_OP_SENDZC_NOTIF;
-	sqe->notification_idx = slot_idx;
+	sqe->opcode = (__u8) IORING_OP_SEND_ZC;
 	sqe->ioprio = zc_flags;
 }
@@ -374,7 +357,6 @@ static int do_setup_tx(int domain, int type, int protocol)

 static void do_tx(int domain, int type, int protocol)
 {
-	struct io_uring_notification_slot b[1] = {{.tag = NOTIF_TAG}};
 	struct io_uring_sqe *sqe;
 	struct io_uring_cqe *cqe;
 	unsigned long packets = 0, bytes = 0;
@@ -390,10 +372,6 @@ static void do_tx(int domain, int type, int protocol)
 	if (ret)
 		error(1, ret, "io_uring: queue init");

-	ret = io_uring_register_notifications(&ring, 1, b);
-	if (ret)
-		error(1, ret, "io_uring: tx ctx registration");
-
 	iov.iov_base = payload;
 	iov.iov_len = cfg_payload_len;
@@ -409,9 +387,8 @@ static void do_tx(int domain, int type, int protocol)
 		for (i = 0; i < cfg_nr_reqs; i++) {
 			unsigned zc_flags = 0;
 			unsigned buf_idx = 0;
-			unsigned slot_idx = 0;
 			unsigned mode = cfg_mode;
-			unsigned msg_flags = 0;
+			unsigned msg_flags = MSG_WAITALL;

 			if (cfg_mode == MODE_MIXED)
 				mode = rand() % 3;
@@ -423,13 +400,10 @@ static void do_tx(int domain, int type, int protocol)
 					   cfg_payload_len, msg_flags);
 			sqe->user_data = NONZC_TAG;
 		} else {
-			if (cfg_flush) {
-				zc_flags |= IORING_RECVSEND_NOTIF_FLUSH;
-				compl_cqes++;
-			}
+			compl_cqes++;
 			io_uring_prep_sendzc(sqe, fd, payload,
 					     cfg_payload_len,
-					     msg_flags, slot_idx, zc_flags);
+					     msg_flags, zc_flags);
 			if (mode == MODE_ZC_FIXED) {
 				sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
 				sqe->buf_index = buf_idx;
@@ -442,51 +416,57 @@ static void do_tx(int domain, int type, int protocol)
 		if (ret != cfg_nr_reqs)
 			error(1, ret, "submit");

-		if (cfg_cork)
-			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
 		for (i = 0; i < cfg_nr_reqs; i++) {
 			ret = io_uring_wait_cqe(&ring, &cqe);
 			if (ret)
 				error(1, ret, "wait cqe");

-			if (cqe->user_data == NOTIF_TAG) {
+			if (cqe->user_data != NONZC_TAG &&
+			    cqe->user_data != ZC_TAG)
+				error(1, -EINVAL, "invalid cqe->user_data");
+
+			if (cqe->flags & IORING_CQE_F_NOTIF) {
+				if (cqe->flags & IORING_CQE_F_MORE)
+					error(1, -EINVAL, "invalid notif flags");
 				compl_cqes--;
 				i--;
-			} else if (cqe->user_data != NONZC_TAG &&
-				   cqe->user_data != ZC_TAG) {
-				error(1, cqe->res, "invalid user_data");
-			} else if (cqe->res <= 0 && cqe->res != -EAGAIN) {
+			} else if (cqe->res <= 0) {
+				if (cqe->flags & IORING_CQE_F_MORE)
+					error(1, cqe->res, "more with a failed send");
 				error(1, cqe->res, "send failed");
 			} else {
-				if (cqe->res > 0) {
-					packets++;
-					bytes += cqe->res;
-				}
-				/* failed requests don't flush */
-				if (cfg_flush &&
-				    cqe->res <= 0 &&
-				    cqe->user_data == ZC_TAG)
-					compl_cqes--;
+				if (cqe->user_data == ZC_TAG &&
+				    !(cqe->flags & IORING_CQE_F_MORE))
+					error(1, cqe->res, "missing more flag");
+				packets++;
+				bytes += cqe->res;
 			}
 			io_uring_cqe_seen(&ring);
 		}
+		if (cfg_cork)
+			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
 	} while (gettimeofday_ms() < tstop);

-	if (close(fd))
-		error(1, errno, "close");
-
-	fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
-		packets, bytes >> 20,
-		packets / (cfg_runtime_ms / 1000),
-		(bytes >> 20) / (cfg_runtime_ms / 1000));
-
 	while (compl_cqes) {
 		ret = io_uring_wait_cqe(&ring, &cqe);
 		if (ret)
 			error(1, ret, "wait cqe");
+		if (cqe->flags & IORING_CQE_F_MORE)
+			error(1, -EINVAL, "invalid notif flags");
+		if (!(cqe->flags & IORING_CQE_F_NOTIF))
+			error(1, -EINVAL, "missing notif flag");
 		io_uring_cqe_seen(&ring);
 		compl_cqes--;
 	}
+
+	fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
+		packets, bytes >> 20,
+		packets / (cfg_runtime_ms / 1000),
+		(bytes >> 20) / (cfg_runtime_ms / 1000));
+
+	if (close(fd))
+		error(1, errno, "close");
 }

 static void do_test(int domain, int type, int protocol)
@@ -500,8 +480,8 @@ static void do_test(int domain, int type, int protocol)

 static void usage(const char *filepath)
 {
-	error(1, 0, "Usage: %s [-f] [-n<N>] [-z0] [-s<payload size>] "
-		    "(-4|-6) [-t<time s>] -D<dst_ip> udp", filepath);
+	error(1, 0, "Usage: %s (-4|-6) (udp|tcp) -D<dst_ip> [-s<payload size>] "
+		    "[-t<time s>] [-n<batch>] [-p<port>] [-m<mode>]", filepath);
 }

 static void parse_opts(int argc, char **argv)
@@ -519,7 +499,7 @@ static void parse_opts(int argc, char **argv)
 		usage(argv[0]);
 	cfg_payload_len = max_payload_len;

-	while ((c = getopt(argc, argv, "46D:p:s:t:n:fc:m:")) != -1) {
+	while ((c = getopt(argc, argv, "46D:p:s:t:n:c:m:")) != -1) {
 		switch (c) {
 		case '4':
 			if (cfg_family != PF_UNSPEC)
@@ -548,9 +528,6 @@ static void parse_opts(int argc, char **argv)
 		case 'n':
 			cfg_nr_reqs = strtoul(optarg, NULL, 0);
 			break;
-		case 'f':
-			cfg_flush = 1;
-			break;
 		case 'c':
 			cfg_cork = strtol(optarg, NULL, 0);
 			break;
@@ -583,8 +560,6 @@ static void parse_opts(int argc, char **argv)
 	if (cfg_payload_len > max_payload_len)
 		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
-	if (cfg_mode == MODE_NONZC && cfg_flush)
-		error(1, 0, "-f: only zerocopy modes support notifications");
 	if (optind != argc - 1)
 		usage(argv[0]);
 }
...
@@ -25,15 +25,11 @@ readonly path_sysctl_mem="net.core.optmem_max"
 # No arguments: automated test
 if [[ "$#" -eq "0" ]]; then
 	IPs=( "4" "6" )
-	protocols=( "tcp" "udp" )

 	for IP in "${IPs[@]}"; do
-		for proto in "${protocols[@]}"; do
-			for mode in $(seq 1 3); do
-				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32
-				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32 -f
-				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32 -c -f
-			done
+		for mode in $(seq 1 3); do
+			$0 "$IP" udp -m "$mode" -t 1 -n 32
+			$0 "$IP" tcp -m "$mode" -t 1 -n 32
 		done
 	done
...