Commit 7ccc3ebf authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'io_uring-6.6-2023-09-08' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:
 "A few fixes that should go into the 6.6-rc merge window:

   - Fix for a regression this merge window caused by the SQPOLL
     affinity patch, where we can race with SQPOLL thread shutdown and
     cause an oops when trying to set affinity (Gabriel)

   - Fix for a regression this merge window where fdinfo reading for a
     ring set up with IORING_SETUP_NO_SQARRAY will attempt to
     dereference the non-existent SQ ring array (me)

   - Add the patch that allows more fine-grained control over who can
     use io_uring (Matteo)

   - Locking fix for a regression added this merge window for IOPOLL
     overflow (Pavel)

   - IOPOLL fix for stable, breaking our loop if helper threads are
     exiting (Pavel)

  Also had a fix for unreaped iopoll requests from io-wq from Ming, but
  we found an issue with that and hence it got reverted. Will get this
  sorted for a future rc"

* tag 'io_uring-6.6-2023-09-08' of git://git.kernel.dk/linux:
  Revert "io_uring: fix IO hang in io_wq_put_and_exit from do_exit()"
  io_uring: fix unprotected iopoll overflow
  io_uring: break out of iowq iopoll on teardown
  io_uring: add a sysctl to disable io_uring system-wide
  io_uring/fdinfo: only print ->sq_array[] if it's there
  io_uring: fix IO hang in io_wq_put_and_exit from do_exit()
  io_uring: Don't set affinity on a dying sqpoll thread
parents 32bf43e4 023464fe
......@@ -450,6 +450,35 @@ this allows system administrators to override the
``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded.
io_uring_disabled
=================
Prevents all processes from creating new io_uring instances. Enabling this
shrinks the kernel's attack surface.
= ======================================================================
0 All processes can create io_uring instances as normal. This is the
  default setting.
1 io_uring creation is disabled (io_uring_setup() will fail with
  -EPERM) for unprivileged processes not in the io_uring_group group.
  Existing io_uring instances can still be used. See the
  documentation for io_uring_group for more information.
2 io_uring creation is disabled for all processes. io_uring_setup()
  always fails with -EPERM. Existing io_uring instances can still be
  used.
= ======================================================================
io_uring_group
==============
When io_uring_disabled is set to 1, a process must either be
privileged (CAP_SYS_ADMIN) or be in the io_uring_group group in order
to create an io_uring instance. If io_uring_group is set to -1 (the
default), only processes with the CAP_SYS_ADMIN capability may create
io_uring instances.
kexec_load_disabled
===================
......
......@@ -93,6 +93,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
struct io_uring_sqe *sqe;
unsigned int sq_idx;
if (ctx->flags & IORING_SETUP_NO_SQARRAY)
break;
sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
if (sq_idx > sq_mask)
continue;
......
......@@ -174,6 +174,16 @@ static void io_worker_ref_put(struct io_wq *wq)
complete(&wq->worker_done);
}
/*
 * Tell the calling io-wq worker whether its workqueue has been marked as
 * exiting.
 *
 * Must be called from io-wq worker context. If it is not, a one-time warning
 * is emitted and true is returned so that the caller stops whatever loop it
 * is in.
 *
 * Returns true when IO_WQ_BIT_EXIT is set in the owning wq's state (or when
 * called from a non-worker task), false otherwise.
 */
bool io_wq_worker_stopped(void)
{
	struct io_worker *self;

	if (WARN_ON_ONCE(!io_wq_current_is_worker()))
		return true;

	/* Only look at the worker data once we know current is a worker. */
	self = current->worker_private;

	return test_bit(IO_WQ_BIT_EXIT, &self->wq->state);
}
static void io_worker_cancel_cb(struct io_worker *worker)
{
struct io_wq_acct *acct = io_wq_get_acct(worker);
......
......@@ -52,6 +52,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val);
int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask);
int io_wq_max_workers(struct io_wq *wq, int *new_count);
bool io_wq_worker_stopped(void);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
......
......@@ -150,6 +150,31 @@ static void io_queue_sqe(struct io_kiocb *req);
struct kmem_cache *req_cachep;
/*
 * Backing storage for the io_uring_disabled and io_uring_group sysctls.
 *
 * sysctl_io_uring_disabled: 0 = everyone may create rings, 1 = creation is
 * restricted to CAP_SYS_ADMIN or members of sysctl_io_uring_group, 2 = ring
 * creation is disabled for everyone.
 *
 * sysctl_io_uring_group: gid allowed to create rings in mode 1. Kept as an
 * int (not gid_t) so that the default of -1, meaning "no group", can be
 * stored and parsed by proc_dointvec.
 */
static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;

#ifdef CONFIG_SYSCTL
static struct ctl_table kernel_io_uring_disabled_table[] = {
	{
		.procname	= "io_uring_disabled",
		.data		= &sysctl_io_uring_disabled,
		.maxlen		= sizeof(sysctl_io_uring_disabled),
		.mode		= 0644,
		/* Clamp writes to the three defined modes, 0..2. */
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_TWO,
	},
	{
		.procname	= "io_uring_group",
		.data		= &sysctl_io_uring_group,
		/*
		 * Size the entry from the object the handler writes to, not
		 * from gid_t: the variable is an int and proc_dointvec
		 * operates on ints.
		 */
		.maxlen		= sizeof(sysctl_io_uring_group),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{},
};
#endif
struct sock *io_uring_get_socket(struct file *file)
{
#if defined(CONFIG_UNIX)
......@@ -883,7 +908,7 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx)
struct io_uring_cqe *cqe = &ctx->completion_cqes[i];
if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) {
if (ctx->task_complete) {
if (ctx->lockless_cq) {
spin_lock(&ctx->completion_lock);
io_cqring_event_overflow(ctx, cqe->user_data,
cqe->res, cqe->flags, 0, 0);
......@@ -1541,7 +1566,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
if (!(req->flags & REQ_F_CQE_SKIP) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
if (ctx->task_complete) {
if (ctx->lockless_cq) {
spin_lock(&ctx->completion_lock);
io_req_cqe_overflow(req);
spin_unlock(&ctx->completion_lock);
......@@ -1950,6 +1975,8 @@ void io_wq_submit_work(struct io_wq_work *work)
if (!needs_poll) {
if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
break;
if (io_wq_worker_stopped())
break;
cond_resched();
continue;
}
......@@ -4038,9 +4065,30 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
return io_uring_create(entries, &p, params);
}
static inline bool io_uring_allowed(void)
{
int disabled = READ_ONCE(sysctl_io_uring_disabled);
kgid_t io_uring_group;
if (disabled == 2)
return false;
if (disabled == 0 || capable(CAP_SYS_ADMIN))
return true;
io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group);
if (!gid_valid(io_uring_group))
return false;
return in_group_p(io_uring_group);
}
/*
 * io_uring_setup(2) entry point.
 *
 * Rejects the call with -EPERM when io_uring_allowed() says the caller may
 * not create a ring; otherwise hands off to io_uring_setup() and returns its
 * result.
 */
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{
	if (io_uring_allowed())
		return io_uring_setup(entries, params);

	return -EPERM;
}
......@@ -4634,6 +4682,10 @@ static int __init io_uring_init(void)
offsetof(struct io_kiocb, cmd.data),
sizeof_field(struct io_kiocb, cmd.data), NULL);
#ifdef CONFIG_SYSCTL
register_sysctl_init("kernel", kernel_io_uring_disabled_table);
#endif
return 0;
};
__initcall(io_uring_init);
......@@ -430,7 +430,9 @@ __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
if (sqd) {
io_sq_thread_park(sqd);
ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
/* Don't set affinity for a dying thread */
if (sqd->thread)
ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
io_sq_thread_unpark(sqd);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment