Commit 3147a068 authored by Linus Torvalds

Merge tag 'for-6.12/io_uring-20240922' of git://git.kernel.dk/linux

Pull more io_uring updates from Jens Axboe:
 "Mostly just a set of fixes in here, or little changes that didn't get
  included in the initial pull request. This contains:

   - Move the SQPOLL napi polling outside the submission lock (Olivier)

   - Rename of the "copy buffers" API that got added in the 6.12 merge
     window. There's really no copying going on, it's just referencing
     the buffers. After a bit of consideration, decided that it was
     better to simply rename this to avoid potential confusion (me)

   - Shrink struct io_mapped_ubuf from 48 to 32 bytes, by changing it to
     start + len tracking rather than having start / end in there, and
     by removing the caching of folio_mask when we can just calculate it
     from folio_shift when we need it (me)

   - Fixes for the SQPOLL affinity checking (me, Felix)

   - Fix for how cqring waiting checks for the presence of task_work.
     Just check it directly rather than check for a specific
     notification mechanism (me)

   - Tweak to how request linking is represented in tracing (me)

   - Fix a syzbot report that deliberately sets up a huge list of
     overflow entries, and then hits rcu stalls when flushing this list.
     Just check for the need to preempt, and drop/reacquire locks in the
     loop. There's no state maintained over the loop itself, and each
     entry is yanked from head-of-list (me)"

* tag 'for-6.12/io_uring-20240922' of git://git.kernel.dk/linux:
  io_uring: check if we need to reschedule during overflow flush
  io_uring: improve request linking trace
  io_uring: check for presence of task_work rather than TIF_NOTIFY_SIGNAL
  io_uring/sqpoll: do the napi busy poll outside the submission block
  io_uring: clean up a type in io_uring_register_get_file()
  io_uring/sqpoll: do not put cpumask on stack
  io_uring/sqpoll: retain test for whether the CPU is valid
  io_uring/rsrc: change ubuf->ubuf_end to length tracking
  io_uring/rsrc: get rid of io_mapped_ubuf->folio_mask
  io_uring: rename "copy buffers" to "clone buffers"
parents 172d5139 eac2ca2d
...@@ -609,8 +609,8 @@ enum io_uring_register_op { ...@@ -609,8 +609,8 @@ enum io_uring_register_op {
IORING_REGISTER_CLOCK = 29, IORING_REGISTER_CLOCK = 29,
/* copy registered buffers from source ring to current ring */ /* clone registered buffers from source ring to current ring */
IORING_REGISTER_COPY_BUFFERS = 30, IORING_REGISTER_CLONE_BUFFERS = 30,
/* this goes last */ /* this goes last */
IORING_REGISTER_LAST, IORING_REGISTER_LAST,
...@@ -701,7 +701,7 @@ enum { ...@@ -701,7 +701,7 @@ enum {
IORING_REGISTER_SRC_REGISTERED = 1, IORING_REGISTER_SRC_REGISTERED = 1,
}; };
struct io_uring_copy_buffers { struct io_uring_clone_buffers {
__u32 src_fd; __u32 src_fd;
__u32 flags; __u32 flags;
__u32 pad[6]; __u32 pad[6];
......
...@@ -177,9 +177,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) ...@@ -177,9 +177,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
struct io_mapped_ubuf *buf = ctx->user_bufs[i]; struct io_mapped_ubuf *buf = ctx->user_bufs[i];
unsigned int len = buf->ubuf_end - buf->ubuf;
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
} }
if (has_lock && !xa_empty(&ctx->personalities)) { if (has_lock && !xa_empty(&ctx->personalities)) {
unsigned long index; unsigned long index;
......
...@@ -635,6 +635,21 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) ...@@ -635,6 +635,21 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
} }
list_del(&ocqe->list); list_del(&ocqe->list);
kfree(ocqe); kfree(ocqe);
/*
* For silly syzbot cases that deliberately overflow by huge
* amounts, check if we need to resched and drop and
* reacquire the locks if so. Nothing real would ever hit this.
* Ideally we'd have a non-posting unlock for this, but hard
* to care for a non-real case.
*/
if (need_resched()) {
io_cq_unlock_post(ctx);
mutex_unlock(&ctx->uring_lock);
cond_resched();
mutex_lock(&ctx->uring_lock);
io_cq_lock(ctx);
}
} }
if (list_empty(&ctx->cq_overflow_list)) { if (list_empty(&ctx->cq_overflow_list)) {
...@@ -2164,7 +2179,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ...@@ -2164,7 +2179,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
* conditions are true (normal request), then just queue it. * conditions are true (normal request), then just queue it.
*/ */
if (unlikely(link->head)) { if (unlikely(link->head)) {
trace_io_uring_link(req, link->head); trace_io_uring_link(req, link->last);
link->last->link = req; link->last->link = req;
link->last = req; link->last = req;
...@@ -2472,7 +2487,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, ...@@ -2472,7 +2487,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
return 1; return 1;
if (unlikely(!llist_empty(&ctx->work_llist))) if (unlikely(!llist_empty(&ctx->work_llist)))
return 1; return 1;
if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) if (unlikely(task_work_pending(current)))
return 1; return 1;
if (unlikely(task_sigpending(current))) if (unlikely(task_sigpending(current)))
return -EINTR; return -EINTR;
...@@ -2579,9 +2594,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, ...@@ -2579,9 +2594,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
* If we got woken because of task_work being processed, run it * If we got woken because of task_work being processed, run it
* now rather than let the caller do another wait loop. * now rather than let the caller do another wait loop.
*/ */
io_run_task_work();
if (!llist_empty(&ctx->work_llist)) if (!llist_empty(&ctx->work_llist))
io_run_local_work(ctx, nr_wait); io_run_local_work(ctx, nr_wait);
io_run_task_work();
/* /*
* Non-local task_work will be run on exit to userspace, but * Non-local task_work will be run on exit to userspace, but
......
...@@ -542,11 +542,11 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ...@@ -542,11 +542,11 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break; break;
ret = io_register_clock(ctx, arg); ret = io_register_clock(ctx, arg);
break; break;
case IORING_REGISTER_COPY_BUFFERS: case IORING_REGISTER_CLONE_BUFFERS:
ret = -EINVAL; ret = -EINVAL;
if (!arg || nr_args != 1) if (!arg || nr_args != 1)
break; break;
ret = io_register_copy_buffers(ctx, arg); ret = io_register_clone_buffers(ctx, arg);
break; break;
default: default:
ret = -EINVAL; ret = -EINVAL;
...@@ -561,7 +561,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ...@@ -561,7 +561,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
* true, then the registered index is used. Otherwise, the normal fd table. * true, then the registered index is used. Otherwise, the normal fd table.
* Caller must call fput() on the returned file, unless it's an ERR_PTR. * Caller must call fput() on the returned file, unless it's an ERR_PTR.
*/ */
struct file *io_uring_register_get_file(int fd, bool registered) struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{ {
struct file *file; struct file *file;
......
...@@ -4,6 +4,6 @@ ...@@ -4,6 +4,6 @@
int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_eventfd_unregister(struct io_ring_ctx *ctx);
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
struct file *io_uring_register_get_file(int fd, bool registered); struct file *io_uring_register_get_file(unsigned int fd, bool registered);
#endif #endif
...@@ -38,7 +38,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, ...@@ -38,7 +38,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
static const struct io_mapped_ubuf dummy_ubuf = { static const struct io_mapped_ubuf dummy_ubuf = {
/* set invalid range, so io_import_fixed() fails meeting it */ /* set invalid range, so io_import_fixed() fails meeting it */
.ubuf = -1UL, .ubuf = -1UL,
.ubuf_end = 0, .len = UINT_MAX,
}; };
int __io_account_mem(struct user_struct *user, unsigned long nr_pages) int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
...@@ -991,16 +991,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, ...@@ -991,16 +991,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
size = iov->iov_len; size = iov->iov_len;
/* store original address for later verification */ /* store original address for later verification */
imu->ubuf = (unsigned long) iov->iov_base; imu->ubuf = (unsigned long) iov->iov_base;
imu->ubuf_end = imu->ubuf + iov->iov_len; imu->len = iov->iov_len;
imu->nr_bvecs = nr_pages; imu->nr_bvecs = nr_pages;
imu->folio_shift = PAGE_SHIFT; imu->folio_shift = PAGE_SHIFT;
imu->folio_mask = PAGE_MASK; if (coalesced)
if (coalesced) {
imu->folio_shift = data.folio_shift; imu->folio_shift = data.folio_shift;
imu->folio_mask = ~((1UL << data.folio_shift) - 1);
}
refcount_set(&imu->refs, 1); refcount_set(&imu->refs, 1);
off = (unsigned long) iov->iov_base & ~imu->folio_mask; off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
*pimu = imu; *pimu = imu;
ret = 0; ret = 0;
...@@ -1100,7 +1097,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, ...@@ -1100,7 +1097,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
return -EFAULT; return -EFAULT;
/* not inside the mapped region */ /* not inside the mapped region */
if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
return -EFAULT; return -EFAULT;
/* /*
...@@ -1143,14 +1140,14 @@ int io_import_fixed(int ddir, struct iov_iter *iter, ...@@ -1143,14 +1140,14 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
iter->bvec = bvec + seg_skip; iter->bvec = bvec + seg_skip;
iter->nr_segs -= seg_skip; iter->nr_segs -= seg_skip;
iter->count -= bvec->bv_len + offset; iter->count -= bvec->bv_len + offset;
iter->iov_offset = offset & ~imu->folio_mask; iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
} }
} }
return 0; return 0;
} }
static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx) static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
{ {
struct io_mapped_ubuf **user_bufs; struct io_mapped_ubuf **user_bufs;
struct io_rsrc_data *data; struct io_rsrc_data *data;
...@@ -1214,9 +1211,9 @@ static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx) ...@@ -1214,9 +1211,9 @@ static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
* *
* Since the memory is already accounted once, don't account it again. * Since the memory is already accounted once, don't account it again.
*/ */
int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg) int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{ {
struct io_uring_copy_buffers buf; struct io_uring_clone_buffers buf;
bool registered_src; bool registered_src;
struct file *file; struct file *file;
int ret; int ret;
...@@ -1234,7 +1231,7 @@ int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg) ...@@ -1234,7 +1231,7 @@ int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
file = io_uring_register_get_file(buf.src_fd, registered_src); file = io_uring_register_get_file(buf.src_fd, registered_src);
if (IS_ERR(file)) if (IS_ERR(file))
return PTR_ERR(file); return PTR_ERR(file);
ret = io_copy_buffers(ctx, file->private_data); ret = io_clone_buffers(ctx, file->private_data);
if (!registered_src) if (!registered_src)
fput(file); fput(file);
return ret; return ret;
......
...@@ -42,12 +42,11 @@ struct io_rsrc_node { ...@@ -42,12 +42,11 @@ struct io_rsrc_node {
struct io_mapped_ubuf { struct io_mapped_ubuf {
u64 ubuf; u64 ubuf;
u64 ubuf_end; unsigned int len;
unsigned int nr_bvecs; unsigned int nr_bvecs;
unsigned int folio_shift; unsigned int folio_shift;
unsigned long acct_pages;
unsigned long folio_mask;
refcount_t refs; refcount_t refs;
unsigned long acct_pages;
struct bio_vec bvec[] __counted_by(nr_bvecs); struct bio_vec bvec[] __counted_by(nr_bvecs);
}; };
...@@ -68,7 +67,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter, ...@@ -68,7 +67,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
struct io_mapped_ubuf *imu, struct io_mapped_ubuf *imu,
u64 buf_addr, size_t len); u64 buf_addr, size_t len);
int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
......
...@@ -196,9 +196,6 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) ...@@ -196,9 +196,6 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
ret = io_submit_sqes(ctx, to_submit); ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock);
if (io_napi(ctx))
ret += io_napi_sqpoll_busy_poll(ctx);
if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
wake_up(&ctx->sqo_sq_wait); wake_up(&ctx->sqo_sq_wait);
if (creds) if (creds)
...@@ -323,6 +320,10 @@ static int io_sq_thread(void *data) ...@@ -323,6 +320,10 @@ static int io_sq_thread(void *data)
if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE)) if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
sqt_spin = true; sqt_spin = true;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
if (io_napi(ctx))
io_napi_sqpoll_busy_poll(ctx);
if (sqt_spin || !time_after(jiffies, timeout)) { if (sqt_spin || !time_after(jiffies, timeout)) {
if (sqt_spin) { if (sqt_spin) {
io_sq_update_worktime(sqd, &start); io_sq_update_worktime(sqd, &start);
...@@ -461,13 +462,22 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, ...@@ -461,13 +462,22 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
return 0; return 0;
if (p->flags & IORING_SETUP_SQ_AFF) { if (p->flags & IORING_SETUP_SQ_AFF) {
struct cpumask allowed_mask; cpumask_var_t allowed_mask;
int cpu = p->sq_thread_cpu; int cpu = p->sq_thread_cpu;
ret = -EINVAL; ret = -EINVAL;
cpuset_cpus_allowed(current, &allowed_mask); if (cpu >= nr_cpu_ids || !cpu_online(cpu))
if (!cpumask_test_cpu(cpu, &allowed_mask)) goto err_sqpoll;
ret = -ENOMEM;
if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
goto err_sqpoll;
ret = -EINVAL;
cpuset_cpus_allowed(current, allowed_mask);
if (!cpumask_test_cpu(cpu, allowed_mask)) {
free_cpumask_var(allowed_mask);
goto err_sqpoll; goto err_sqpoll;
}
free_cpumask_var(allowed_mask);
sqd->sq_cpu = cpu; sqd->sq_cpu = cpu;
} else { } else {
sqd->sq_cpu = -1; sqd->sq_cpu = -1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment