Commit 5695e516 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'io_uring-worker.v3-2021-02-25' of git://git.kernel.dk/linux-block

Pull io_uring thread rewrite from Jens Axboe:
 "This converts the io-wq workers to be forked off the tasks in question
  instead of being kernel threads that assume various bits of the
  original task identity.

  This kills > 400 lines of code from io_uring/io-wq, and it's the worst
  part of the code. We've had several bugs in this area, and the worry
  is always that we could be missing some pieces for file types doing
  unusual things (recent /dev/tty example comes to mind, userfaultfd
  reads installing file descriptors is another fun one... - both of
  which need special handling, and I bet it's not the last weird oddity
  we'll find).

  With these identical workers, we can have full confidence that we're
  never missing anything. That, in itself, is a huge win. Outside of
  that, it's also more efficient since we're not wasting space and code
  on tracking state, or switching between different states.

  I'm sure we're going to find little things to patch up after this
  series, but testing has been pretty thorough, from the usual
  regression suite to production. Any issue that may crop up should be
  manageable.

  There's also a nice series of further reductions we can do on top of
  this, but I wanted to get the meat of it out sooner rather than later.
  The general worry here isn't that it's fundamentally broken. Most of
  the little issues we've found over the last week have been related to
  just changes in how thread startup/exit is done, since that's the main
  difference between using kthreads and these kinds of threads. In fact,
  if all goes according to plan, I want to get this into the 5.10 and
  5.11 stable branches as well.

  That said, the changes outside of io_uring/io-wq are:

   - arch setup, simple one-liner to each arch copy_thread()
     implementation.

   - Removal of net and proc restrictions for io_uring, they are no
     longer needed or useful"

* tag 'io_uring-worker.v3-2021-02-25' of git://git.kernel.dk/linux-block: (30 commits)
  io-wq: remove now unused IO_WQ_BIT_ERROR
  io_uring: fix SQPOLL thread handling over exec
  io-wq: improve manager/worker handling over exec
  io_uring: ensure SQPOLL startup is triggered before error shutdown
  io-wq: make buffered file write hashed work map per-ctx
  io-wq: fix race around io_worker grabbing
  io-wq: fix races around manager/worker creation and task exit
  io_uring: ensure io-wq context is always destroyed for tasks
  arch: ensure parisc/powerpc handle PF_IO_WORKER in copy_thread()
  io_uring: cleanup ->user usage
  io-wq: remove nr_process accounting
  io_uring: flag new native workers with IORING_FEAT_NATIVE_WORKERS
  net: remove cmsg restriction from io_uring based send/recvmsg calls
  Revert "proc: don't allow async path resolution of /proc/self components"
  Revert "proc: don't allow async path resolution of /proc/thread-self components"
  io_uring: move SQPOLL thread io-wq forked worker
  io-wq: make io_wq_fork_thread() available to other users
  io-wq: only remove worker from free_list, if it was there
  io_uring: remove io_identity
  io_uring: remove any grabbing of context
  ...
parents 5ceabb60 d6ce7f67
......@@ -249,7 +249,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
childti->pcb.ksp = (unsigned long) childstack;
childti->pcb.flags = 1; /* set FEN, clear everything else */
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
memset(childstack, 0,
sizeof(struct switch_stack) + sizeof(struct pt_regs));
......
......@@ -191,7 +191,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
childksp[0] = 0; /* fp */
childksp[1] = (unsigned long)ret_from_fork; /* blink */
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(c_regs, 0, sizeof(struct pt_regs));
c_callee->r13 = kthread_arg;
......
......@@ -243,7 +243,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
thread->cpu_domain = get_domain();
#endif
if (likely(!(p->flags & PF_KTHREAD))) {
if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) {
*childregs = *current_pt_regs();
childregs->ARM_r0 = 0;
if (stack_start)
......
......@@ -398,7 +398,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
ptrauth_thread_init_kernel(p);
if (likely(!(p->flags & PF_KTHREAD))) {
if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) {
*childregs = *current_pt_regs();
childregs->regs[0] = 0;
......
......@@ -49,7 +49,7 @@ int copy_thread(unsigned long clone_flags,
/* setup thread.sp for switch_to !!! */
p->thread.sp = (unsigned long)childstack;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
childstack->r15 = (unsigned long) ret_from_kernel_thread;
childstack->r10 = kthread_arg;
......
......@@ -112,7 +112,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
childregs = (struct pt_regs *) (THREAD_SIZE + task_stack_page(p)) - 1;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
childregs->retpc = (unsigned long) ret_from_kernel_thread;
childregs->er4 = topstk; /* arg */
......
......@@ -73,7 +73,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
sizeof(*ss));
ss->lr = (unsigned long)ret_from_fork;
p->thread.switch_sp = ss;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
/* r24 <- fn, r25 <- arg */
ss->r24 = usp;
......
......@@ -338,7 +338,7 @@ copy_thread(unsigned long clone_flags, unsigned long user_stack_base,
ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
if (unlikely(!user_stack_base)) {
/* fork_idle() called us */
return 0;
......
......@@ -157,7 +157,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
*/
p->thread.fs = get_fs().seg;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
memset(frame, 0, sizeof(struct fork_frame));
frame->regs.sr = PS_S;
......
......@@ -59,7 +59,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
struct pt_regs *childregs = task_pt_regs(p);
struct thread_info *ti = task_thread_info(p);
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* if we're creating a new kernel thread then just zeroing all
* the registers. That's OK for a brand new thread.*/
memset(childregs, 0, sizeof(struct pt_regs));
......
......@@ -120,7 +120,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
/* Put the stack after the struct pt_regs. */
childksp = (unsigned long) childregs;
p->thread.cp0_status = (read_c0_status() & ~(ST0_CU2|ST0_CU1)) | ST0_KERNEL_CUMASK;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
unsigned long status = p->thread.cp0_status;
memset(childregs, 0, sizeof(struct pt_regs));
......
......@@ -156,7 +156,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start,
memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
/* kernel thread fn */
p->thread.cpu_context.r6 = stack_start;
......
......@@ -109,7 +109,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
struct switch_stack *childstack =
((struct switch_stack *)childregs) - 1;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childstack, 0,
sizeof(struct switch_stack) + sizeof(struct pt_regs));
......
......@@ -174,7 +174,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
sp -= sizeof(struct pt_regs);
kregs = (struct pt_regs *)sp;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(kregs, 0, sizeof(struct pt_regs));
kregs->gpr[20] = usp; /* fn, kernel thread */
kregs->gpr[22] = arg;
......
......@@ -200,7 +200,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
extern void * const ret_from_kernel_thread;
extern void * const child_return;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
memset(cregs, 0, sizeof(struct pt_regs));
if (!usp) /* idle thread */
......
......@@ -1670,7 +1670,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
/* Copy registers */
sp -= sizeof(struct pt_regs);
childregs = (struct pt_regs *) sp;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
childregs->gpr[1] = sp + sizeof(struct pt_regs);
......
......@@ -124,7 +124,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
struct pt_regs *childregs = task_pt_regs(p);
/* p->thread holds context to be restored by __switch_to() */
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* Kernel thread */
memset(childregs, 0, sizeof(struct pt_regs));
childregs->gp = gp_in_global;
......
......@@ -130,7 +130,7 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
frame->sf.gprs[9] = (unsigned long)frame;
/* Store access registers to kernel stack of new process. */
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
/* kernel thread */
memset(&frame->childregs, 0, sizeof(struct pt_regs));
frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT |
......
......@@ -114,7 +114,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg,
childregs = task_pt_regs(p);
p->thread.sp = (unsigned long) childregs;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
p->thread.pc = (unsigned long) ret_from_kernel_thread;
childregs->regs[4] = arg;
......
......@@ -309,7 +309,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
ti->ksp = (unsigned long) new_stack;
p->thread.kregs = childregs;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
extern int nwindows;
unsigned long psr;
memset(new_stack, 0, STACKFRAME_SZ + TRACEREG_SZ);
......
......@@ -597,7 +597,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
sizeof(struct sparc_stackf));
t->fpsaved[0] = 0;
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(child_trap_frame, 0, child_stack_sz);
__thread_flag_byte_ptr(t)[TI_FLAG_BYTE_CWP] =
(current_pt_regs()->tstate + 1) & TSTATE_CWP;
......
......@@ -157,7 +157,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
unsigned long arg, struct task_struct * p, unsigned long tls)
{
void (*handler)(void);
int kthread = current->flags & PF_KTHREAD;
int kthread = current->flags & (PF_KTHREAD | PF_IO_WORKER);
int ret = 0;
p->thread = (struct thread_struct) INIT_THREAD;
......
......@@ -161,7 +161,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
#endif
/* Kernel thread ? */
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
kthread_frame_init(frame, sp, arg);
return 0;
......
......@@ -217,7 +217,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn,
p->thread.sp = (unsigned long)childregs;
if (!(p->flags & PF_KTHREAD)) {
if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
struct pt_regs *regs = current_pt_regs();
unsigned long usp = usp_thread_fn ?
usp_thread_fn : regs->areg[1];
......
This diff is collapsed.
#ifndef INTERNAL_IO_WQ_H
#define INTERNAL_IO_WQ_H
#include <linux/refcount.h>
#include <linux/io_uring.h>
struct io_wq;
......@@ -11,13 +12,6 @@ enum {
IO_WQ_WORK_UNBOUND = 4,
IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_WORK_FILES = 32,
IO_WQ_WORK_FS = 64,
IO_WQ_WORK_MM = 128,
IO_WQ_WORK_CREDS = 256,
IO_WQ_WORK_BLKCG = 512,
IO_WQ_WORK_FSIZE = 1024,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
......@@ -85,7 +79,7 @@ static inline void wq_list_del(struct io_wq_work_list *list,
struct io_wq_work {
struct io_wq_work_node list;
struct io_identity *identity;
const struct cred *creds;
unsigned flags;
};
......@@ -100,20 +94,32 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
struct io_wq_hash {
refcount_t refs;
unsigned long map;
struct wait_queue_head wait;
};
static inline void io_wq_put_hash(struct io_wq_hash *hash)
{
if (refcount_dec_and_test(&hash->refs))
kfree(hash);
}
struct io_wq_data {
struct io_wq_hash *hash;
io_wq_work_fn *do_work;
free_work_fn *free_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_put(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
void io_wq_hash_work(struct io_wq_work *work, void *val);
pid_t io_wq_fork_thread(int (*fn)(void *), void *arg);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
......@@ -124,8 +130,6 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel,
void *data, bool cancel_all);
struct task_struct *io_wq_get_task(struct io_wq *wq);
#if defined(CONFIG_IO_WQ)
extern void io_wq_worker_sleeping(struct task_struct *);
extern void io_wq_worker_running(struct task_struct *);
......@@ -140,6 +144,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk)
static inline bool io_wq_current_is_worker(void)
{
return in_task() && (current->flags & PF_IO_WORKER);
return in_task() && (current->flags & PF_IO_WORKER) &&
current->pf_io_worker;
}
#endif
This diff is collapsed.
......@@ -16,13 +16,6 @@ static const char *proc_self_get_link(struct dentry *dentry,
pid_t tgid = task_tgid_nr_ns(current, ns);
char *name;
/*
* Not currently supported. Once we can inherit all of struct pid,
* we can allow this.
*/
if (current->flags & PF_IO_WORKER)
return ERR_PTR(-EOPNOTSUPP);
if (!tgid)
return ERR_PTR(-ENOENT);
/* max length of unsigned int in decimal + NULL term */
......
......@@ -17,13 +17,6 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
/*
* Not currently supported. Once we can inherit all of struct pid,
* we can allow this.
*/
if (current->flags & PF_IO_WORKER)
return ERR_PTR(-EOPNOTSUPP);
if (!pid)
return ERR_PTR(-ENOENT);
name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
......
......@@ -5,23 +5,6 @@
#include <linux/sched.h>
#include <linux/xarray.h>
struct io_identity {
struct files_struct *files;
struct mm_struct *mm;
#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *blkcg_css;
#endif
const struct cred *creds;
struct nsproxy *nsproxy;
struct fs_struct *fs;
unsigned long fsize;
#ifdef CONFIG_AUDIT
kuid_t loginuid;
unsigned int sessionid;
#endif
refcount_t count;
};
struct io_wq_work_node {
struct io_wq_work_node *next;
};
......@@ -36,9 +19,8 @@ struct io_uring_task {
struct xarray xa;
struct wait_queue_head wait;
struct file *last;
void *io_wq;
struct percpu_counter inflight;
struct io_identity __identity;
struct io_identity *identity;
atomic_t in_idle;
bool sqpoll;
......@@ -61,7 +43,7 @@ static inline void io_uring_task_cancel(void)
}
static inline void io_uring_files_cancel(struct files_struct *files)
{
if (current->io_uring && !xa_empty(&current->io_uring->xa))
if (current->io_uring)
__io_uring_files_cancel(files);
}
static inline void io_uring_free(struct task_struct *tsk)
......
......@@ -42,8 +42,6 @@ struct net;
#define SOCK_PASSCRED 3
#define SOCK_PASSSEC 4
#define PROTO_CMSG_DATA_ONLY 0x0001
#ifndef ARCH_HAS_SOCKET_TYPES
/**
* enum sock_type - Socket types
......@@ -138,7 +136,6 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *,
struct proto_ops {
int family;
unsigned int flags;
struct module *owner;
int (*release) (struct socket *sock);
int (*bind) (struct socket *sock,
......
......@@ -895,6 +895,9 @@ struct task_struct {
/* CLONE_CHILD_CLEARTID: */
int __user *clear_child_tid;
/* PF_IO_WORKER */
void *pf_io_worker;
u64 utime;
u64 stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
......
......@@ -262,6 +262,7 @@ struct io_uring_params {
#define IORING_FEAT_POLL_32BITS (1U << 6)
#define IORING_FEAT_SQPOLL_NONFIXED (1U << 7)
#define IORING_FEAT_EXT_ARG (1U << 8)
#define IORING_FEAT_NATIVE_WORKERS (1U << 9)
/*
* io_uring_register(2) opcodes and arguments
......
......@@ -375,7 +375,7 @@ static int ptrace_attach(struct task_struct *task, long request,
audit_ptrace(task);
retval = -EPERM;
if (unlikely(task->flags & PF_KTHREAD))
if (unlikely(task->flags & (PF_KTHREAD | PF_IO_WORKER)))
goto out;
if (same_thread_group(task, current))
goto out;
......
......@@ -91,7 +91,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
return true;
/* Only allow kernel generated signals to this kthread */
if (unlikely((t->flags & PF_KTHREAD) &&
if (unlikely((t->flags & (PF_KTHREAD | PF_IO_WORKER)) &&
(handler == SIG_KTHREAD_KERNEL) && !force))
return true;
......@@ -1096,7 +1096,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
/*
* Skip useless siginfo allocation for SIGKILL and kernel threads.
*/
if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
if ((sig == SIGKILL) || (t->flags & (PF_KTHREAD | PF_IO_WORKER)))
goto out_set;
/*
......
......@@ -1021,7 +1021,6 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.flags = PROTO_CMSG_DATA_ONLY,
.owner = THIS_MODULE,
.release = inet_release,
.bind = inet_bind,
......
......@@ -665,7 +665,6 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
const struct proto_ops inet6_stream_ops = {
.family = PF_INET6,
.flags = PROTO_CMSG_DATA_ONLY,
.owner = THIS_MODULE,
.release = inet6_release,
.bind = inet6_bind,
......
......@@ -2413,10 +2413,6 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
unsigned int flags)
{
/* disallow ancillary data requests from this path */
if (msg->msg_control || msg->msg_controllen)
return -EINVAL;
return ____sys_sendmsg(sock, msg, flags, NULL, 0);
}
......@@ -2625,12 +2621,6 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
struct user_msghdr __user *umsg,
struct sockaddr __user *uaddr, unsigned int flags)
{
if (msg->msg_control || msg->msg_controllen) {
/* disallow ancillary data reqs unless cmsg is plain data */
if (!(sock->ops->flags & PROTO_CMSG_DATA_ONLY))
return -EINVAL;
}
return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment