Commit 5bbb336b authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.12/io_uring-2021-02-17' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Highlights from this cycle are things like request recycling and
  task_work optimizations, which net us anywhere from 10-20% of speedups
  on workloads that mostly are inline.

  This work was originally done to put io_uring under memcg, which adds
  considerable overhead. But it's a really nice win as well. Also worth
  highlighting is the LOOKUP_CACHED work in the VFS, and using it in
  io_uring. Greatly speeds up the fast path for file opens.

  Summary:

   - Put io_uring under memcg protection. We accounted just the rings
     themselves under rlimit memlock before, now we account everything.

   - Request cache recycling, persistent across invocations (Pavel, me)

   - First part of a cleanup/improvement to buffer registration (Bijan)

   - SQPOLL fixes (Hao)

   - File registration NULL pointer fixup (Dan)

   - LOOKUP_CACHED support for io_uring

   - Disable /proc/thread-self/ for io_uring, like we do for /proc/self

   - Add Pavel to the io_uring MAINTAINERS entry

   - Tons of code cleanups and optimizations (Pavel)

   - Support for skip entries in file registration (Noah)"

* tag 'for-5.12/io_uring-2021-02-17' of git://git.kernel.dk/linux-block: (103 commits)
  io_uring: tctx->task_lock should be IRQ safe
  proc: don't allow async path resolution of /proc/thread-self components
  io_uring: kill cached requests from exiting task closing the ring
  io_uring: add helper to free all request caches
  io_uring: allow task match to be passed to io_req_cache_free()
  io-wq: clear out worker ->fs and ->files
  io_uring: optimise io_init_req() flags setting
  io_uring: clean io_req_find_next() fast check
  io_uring: don't check PF_EXITING from syscall
  io_uring: don't split out consume out of SQE get
  io_uring: save ctx put/get for task_work submit
  io_uring: don't duplicate io_req_task_queue()
  io_uring: optimise SQPOLL mm/files grabbing
  io_uring: optimise out unlikely link queue
  io_uring: take compl state from submit state
  io_uring: inline io_complete_rw_common()
  io_uring: move res check out of io_rw_reissue()
  io_uring: simplify iopoll reissuing
  io_uring: clean up io_req_free_batch_finish()
  io_uring: move submit side state closer in the ring
  ...
parents 9820b4dc 0b81e80c
...@@ -6830,6 +6830,9 @@ F: include/linux/fs.h ...@@ -6830,6 +6830,9 @@ F: include/linux/fs.h
F: include/linux/fs_types.h F: include/linux/fs_types.h
F: include/uapi/linux/fs.h F: include/uapi/linux/fs.h
F: include/uapi/linux/openat2.h F: include/uapi/linux/openat2.h
X: fs/io-wq.c
X: fs/io-wq.h
X: fs/io_uring.c
FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
M: Riku Voipio <riku.voipio@iki.fi> M: Riku Voipio <riku.voipio@iki.fi>
...@@ -9263,6 +9266,7 @@ F: include/uapi/linux/iommu.h ...@@ -9263,6 +9266,7 @@ F: include/uapi/linux/iommu.h
IO_URING IO_URING
M: Jens Axboe <axboe@kernel.dk> M: Jens Axboe <axboe@kernel.dk>
R: Pavel Begunkov <asml.silence@gmail.com>
L: io-uring@vger.kernel.org L: io-uring@vger.kernel.org
S: Maintained S: Maintained
T: git git://git.kernel.dk/linux-block T: git git://git.kernel.dk/linux-block
...@@ -9270,6 +9274,7 @@ T: git git://git.kernel.dk/liburing ...@@ -9270,6 +9274,7 @@ T: git git://git.kernel.dk/liburing
F: fs/io-wq.c F: fs/io-wq.c
F: fs/io-wq.h F: fs/io-wq.h
F: fs/io_uring.c F: fs/io_uring.c
F: include/linux/io_uring.h
F: include/uapi/linux/io_uring.h F: include/uapi/linux/io_uring.h
IPMI SUBSYSTEM IPMI SUBSYSTEM
......
...@@ -22,6 +22,8 @@ ...@@ -22,6 +22,8 @@
#include <linux/close_range.h> #include <linux/close_range.h>
#include <net/sock.h> #include <net/sock.h>
#include "internal.h"
unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG; unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */ /* our min() is unusable in constant expressions ;-/ */
...@@ -732,36 +734,48 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) ...@@ -732,36 +734,48 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
} }
/* /*
* variant of close_fd that gets a ref on the file for later fput. * See close_fd_get_file() below, this variant assumes current->files->file_lock
* The caller must ensure that filp_close() called on the file, and then * is held.
* an fput().
*/ */
int close_fd_get_file(unsigned int fd, struct file **res) int __close_fd_get_file(unsigned int fd, struct file **res)
{ {
struct files_struct *files = current->files; struct files_struct *files = current->files;
struct file *file; struct file *file;
struct fdtable *fdt; struct fdtable *fdt;
spin_lock(&files->file_lock);
fdt = files_fdtable(files); fdt = files_fdtable(files);
if (fd >= fdt->max_fds) if (fd >= fdt->max_fds)
goto out_unlock; goto out_err;
file = fdt->fd[fd]; file = fdt->fd[fd];
if (!file) if (!file)
goto out_unlock; goto out_err;
rcu_assign_pointer(fdt->fd[fd], NULL); rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd); __put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
get_file(file); get_file(file);
*res = file; *res = file;
return 0; return 0;
out_err:
out_unlock:
spin_unlock(&files->file_lock);
*res = NULL; *res = NULL;
return -ENOENT; return -ENOENT;
} }
/*
* variant of close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() called on the file, and then
* an fput().
*/
int close_fd_get_file(unsigned int fd, struct file **res)
{
struct files_struct *files = current->files;
int ret;
spin_lock(&files->file_lock);
ret = __close_fd_get_file(fd, res);
spin_unlock(&files->file_lock);
return ret;
}
void do_close_on_exec(struct files_struct *files) void do_close_on_exec(struct files_struct *files)
{ {
unsigned i; unsigned i;
......
...@@ -133,6 +133,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, ...@@ -133,6 +133,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *); const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode); extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op); extern int build_open_flags(const struct open_how *how, struct open_flags *op);
extern int __close_fd_get_file(unsigned int fd, struct file **res);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small); long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode); int chmod_common(const struct path *path, umode_t mode);
......
...@@ -64,9 +64,7 @@ struct io_worker { ...@@ -64,9 +64,7 @@ struct io_worker {
#endif #endif
const struct cred *cur_creds; const struct cred *cur_creds;
const struct cred *saved_creds; const struct cred *saved_creds;
struct files_struct *restore_files;
struct nsproxy *restore_nsproxy; struct nsproxy *restore_nsproxy;
struct fs_struct *restore_fs;
}; };
#if BITS_PER_LONG == 64 #if BITS_PER_LONG == 64
...@@ -156,19 +154,19 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) ...@@ -156,19 +154,19 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
worker->cur_creds = worker->saved_creds = NULL; worker->cur_creds = worker->saved_creds = NULL;
} }
if (current->files != worker->restore_files) { if (current->files) {
__acquire(&wqe->lock); __acquire(&wqe->lock);
raw_spin_unlock_irq(&wqe->lock); raw_spin_unlock_irq(&wqe->lock);
dropped_lock = true; dropped_lock = true;
task_lock(current); task_lock(current);
current->files = worker->restore_files; current->files = NULL;
current->nsproxy = worker->restore_nsproxy; current->nsproxy = worker->restore_nsproxy;
task_unlock(current); task_unlock(current);
} }
if (current->fs != worker->restore_fs) if (current->fs)
current->fs = worker->restore_fs; current->fs = NULL;
/* /*
* If we have an active mm, we need to drop the wq lock before unusing * If we have an active mm, we need to drop the wq lock before unusing
...@@ -329,11 +327,11 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) ...@@ -329,11 +327,11 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
allow_kernel_signal(SIGINT); allow_kernel_signal(SIGINT);
current->flags |= PF_IO_WORKER; current->flags |= PF_IO_WORKER;
current->fs = NULL;
current->files = NULL;
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
worker->restore_files = current->files;
worker->restore_nsproxy = current->nsproxy; worker->restore_nsproxy = current->nsproxy;
worker->restore_fs = current->fs;
io_wqe_inc_running(wqe, worker); io_wqe_inc_running(wqe, worker);
} }
...@@ -555,23 +553,21 @@ static void io_worker_handle_work(struct io_worker *worker) ...@@ -555,23 +553,21 @@ static void io_worker_handle_work(struct io_worker *worker)
/* handle a whole dependent link */ /* handle a whole dependent link */
do { do {
struct io_wq_work *old_work, *next_hashed, *linked; struct io_wq_work *next_hashed, *linked;
unsigned int hash = io_get_work_hash(work); unsigned int hash = io_get_work_hash(work);
next_hashed = wq_next_work(work); next_hashed = wq_next_work(work);
io_impersonate_work(worker, work); io_impersonate_work(worker, work);
wq->do_work(work);
io_assign_current_work(worker, NULL);
old_work = work; linked = wq->free_work(work);
linked = wq->do_work(work);
work = next_hashed; work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) { if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked; work = linked;
linked = NULL; linked = NULL;
} }
io_assign_current_work(worker, work); io_assign_current_work(worker, work);
wq->free_work(old_work);
if (linked) if (linked)
io_wqe_enqueue(wqe, linked); io_wqe_enqueue(wqe, linked);
...@@ -850,11 +846,9 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) ...@@ -850,11 +846,9 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
struct io_wq *wq = wqe->wq; struct io_wq *wq = wqe->wq;
do { do {
struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL; work->flags |= IO_WQ_WORK_CANCEL;
work = wq->do_work(work); wq->do_work(work);
wq->free_work(old_work); work = wq->free_work(work);
} while (work); } while (work);
} }
...@@ -944,7 +938,6 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) ...@@ -944,7 +938,6 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
*/ */
spin_lock_irqsave(&worker->lock, flags); spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work && if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
match->fn(worker->cur_work, match->data)) { match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1); send_sig(SIGINT, worker->task, 1);
match->nr_running++; match->nr_running++;
......
...@@ -9,7 +9,6 @@ enum { ...@@ -9,7 +9,6 @@ enum {
IO_WQ_WORK_CANCEL = 1, IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HASHED = 2, IO_WQ_WORK_HASHED = 2,
IO_WQ_WORK_UNBOUND = 4, IO_WQ_WORK_UNBOUND = 4,
IO_WQ_WORK_NO_CANCEL = 8,
IO_WQ_WORK_CONCURRENT = 16, IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_WORK_FILES = 32, IO_WQ_WORK_FILES = 32,
...@@ -28,15 +27,6 @@ enum io_wq_cancel { ...@@ -28,15 +27,6 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */ IO_WQ_CANCEL_NOTFOUND, /* work not found */
}; };
struct io_wq_work_node {
struct io_wq_work_node *next;
};
struct io_wq_work_list {
struct io_wq_work_node *first;
struct io_wq_work_node *last;
};
static inline void wq_list_add_after(struct io_wq_work_node *node, static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos, struct io_wq_work_node *pos,
struct io_wq_work_list *list) struct io_wq_work_list *list)
...@@ -107,8 +97,8 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) ...@@ -107,8 +97,8 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
return container_of(work->list.next, struct io_wq_work, list); return container_of(work->list.next, struct io_wq_work, list);
} }
typedef void (free_work_fn)(struct io_wq_work *); typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *); typedef void (io_wq_work_fn)(struct io_wq_work *);
struct io_wq_data { struct io_wq_data {
struct user_struct *user; struct user_struct *user;
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -20,7 +20,7 @@ static const char *proc_self_get_link(struct dentry *dentry, ...@@ -20,7 +20,7 @@ static const char *proc_self_get_link(struct dentry *dentry,
* Not currently supported. Once we can inherit all of struct pid, * Not currently supported. Once we can inherit all of struct pid,
* we can allow this. * we can allow this.
*/ */
if (current->flags & PF_KTHREAD) if (current->flags & PF_IO_WORKER)
return ERR_PTR(-EOPNOTSUPP); return ERR_PTR(-EOPNOTSUPP);
if (!tgid) if (!tgid)
......
...@@ -17,6 +17,13 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, ...@@ -17,6 +17,13 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
pid_t pid = task_pid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns);
char *name; char *name;
/*
* Not currently supported. Once we can inherit all of struct pid,
* we can allow this.
*/
if (current->flags & PF_IO_WORKER)
return ERR_PTR(-EOPNOTSUPP);
if (!pid) if (!pid)
return ERR_PTR(-ENOENT); return ERR_PTR(-ENOENT);
name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
......
...@@ -22,6 +22,15 @@ struct io_identity { ...@@ -22,6 +22,15 @@ struct io_identity {
refcount_t count; refcount_t count;
}; };
struct io_wq_work_node {
struct io_wq_work_node *next;
};
struct io_wq_work_list {
struct io_wq_work_node *first;
struct io_wq_work_node *last;
};
struct io_uring_task { struct io_uring_task {
/* submission side */ /* submission side */
struct xarray xa; struct xarray xa;
...@@ -32,6 +41,11 @@ struct io_uring_task { ...@@ -32,6 +41,11 @@ struct io_uring_task {
struct io_identity *identity; struct io_identity *identity;
atomic_t in_idle; atomic_t in_idle;
bool sqpoll; bool sqpoll;
spinlock_t task_lock;
struct io_wq_work_list task_list;
unsigned long task_state;
struct callback_head task_work;
}; };
#if defined(CONFIG_IO_URING) #if defined(CONFIG_IO_URING)
......
...@@ -285,12 +285,22 @@ enum { ...@@ -285,12 +285,22 @@ enum {
IORING_REGISTER_LAST IORING_REGISTER_LAST
}; };
/* deprecated, see struct io_uring_rsrc_update */
struct io_uring_files_update { struct io_uring_files_update {
__u32 offset; __u32 offset;
__u32 resv; __u32 resv;
__aligned_u64 /* __s32 * */ fds; __aligned_u64 /* __s32 * */ fds;
}; };
struct io_uring_rsrc_update {
__u32 offset;
__u32 resv;
__aligned_u64 data;
};
/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2)
#define IO_URING_OP_SUPPORTED (1U << 0) #define IO_URING_OP_SUPPORTED (1U << 0)
struct io_uring_probe_op { struct io_uring_probe_op {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment