Commit 5bbb336b authored by Linus Torvalds

Merge tag 'for-5.12/io_uring-2021-02-17' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "Highlights from this cycle are things like request recycling and
  task_work optimizations, which net us speedups of anywhere from 10-20%
  on workloads that are mostly inline.

  This work was originally done to put io_uring under memcg, which adds
  considerable overhead. But it's a really nice win as well. Also worth
  highlighting is the LOOKUP_CACHED work in the VFS, and using it in
  io_uring. Greatly speeds up the fast path for file opens.

  Summary:

   - Put io_uring under memcg protection. We accounted just the rings
     themselves under rlimit memlock before, now we account everything.

   - Request cache recycling, persistent across invocations (Pavel, me)

   - First part of a cleanup/improvement to buffer registration (Bijan)

   - SQPOLL fixes (Hao)

   - File registration NULL pointer fixup (Dan)

   - LOOKUP_CACHED support for io_uring

   - Disable /proc/thread-self/ for io_uring, like we do for /proc/self

   - Add Pavel to the io_uring MAINTAINERS entry

   - Tons of code cleanups and optimizations (Pavel)

   - Support for skip entries in file registration (Noah)"

* tag 'for-5.12/io_uring-2021-02-17' of git://git.kernel.dk/linux-block: (103 commits)
  io_uring: tctx->task_lock should be IRQ safe
  proc: don't allow async path resolution of /proc/thread-self components
  io_uring: kill cached requests from exiting task closing the ring
  io_uring: add helper to free all request caches
  io_uring: allow task match to be passed to io_req_cache_free()
  io-wq: clear out worker ->fs and ->files
  io_uring: optimise io_init_req() flags setting
  io_uring: clean io_req_find_next() fast check
  io_uring: don't check PF_EXITING from syscall
  io_uring: don't split out consume out of SQE get
  io_uring: save ctx put/get for task_work submit
  io_uring: don't duplicate io_req_task_queue()
  io_uring: optimise SQPOLL mm/files grabbing
  io_uring: optimise out unlikely link queue
  io_uring: take compl state from submit state
  io_uring: inline io_complete_rw_common()
  io_uring: move res check out of io_rw_reissue()
  io_uring: simplify iopoll reissuing
  io_uring: clean up io_req_free_batch_finish()
  io_uring: move submit side state closer in the ring
  ...
parents 9820b4dc 0b81e80c
......@@ -6830,6 +6830,9 @@ F: include/linux/fs.h
F: include/linux/fs_types.h
F: include/uapi/linux/fs.h
F: include/uapi/linux/openat2.h
X: fs/io-wq.c
X: fs/io-wq.h
X: fs/io_uring.c
FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER
M: Riku Voipio <riku.voipio@iki.fi>
......@@ -9263,6 +9266,7 @@ F: include/uapi/linux/iommu.h
IO_URING
M: Jens Axboe <axboe@kernel.dk>
R: Pavel Begunkov <asml.silence@gmail.com>
L: io-uring@vger.kernel.org
S: Maintained
T: git git://git.kernel.dk/linux-block
......@@ -9270,6 +9274,7 @@ T: git git://git.kernel.dk/liburing
F: fs/io-wq.c
F: fs/io-wq.h
F: fs/io_uring.c
F: include/linux/io_uring.h
F: include/uapi/linux/io_uring.h
IPMI SUBSYSTEM
......
......@@ -22,6 +22,8 @@
#include <linux/close_range.h>
#include <net/sock.h>
#include "internal.h"
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
......@@ -732,36 +734,48 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
}
/*
* variant of close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() called on the file, and then
* an fput().
* See close_fd_get_file() below, this variant assumes current->files->file_lock
* is held.
*/
int close_fd_get_file(unsigned int fd, struct file **res)
int __close_fd_get_file(unsigned int fd, struct file **res)
{
struct files_struct *files = current->files;
struct file *file;
struct fdtable *fdt;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
if (fd >= fdt->max_fds)
goto out_unlock;
goto out_err;
file = fdt->fd[fd];
if (!file)
goto out_unlock;
goto out_err;
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
get_file(file);
*res = file;
return 0;
out_unlock:
spin_unlock(&files->file_lock);
out_err:
*res = NULL;
return -ENOENT;
}
/*
 * Variant of close_fd that gets a ref on the file for later fput.
 * This is the locked wrapper around __close_fd_get_file(): it takes
 * current->files->file_lock itself for the duration of that call.
 * The caller must ensure that filp_close() is called on the file, and then
 * an fput().
 * Returns the result of __close_fd_get_file() unchanged.
 */
int close_fd_get_file(unsigned int fd, struct file **res)
{
	struct files_struct *cur_files = current->files;
	int err;

	spin_lock(&cur_files->file_lock);
	err = __close_fd_get_file(fd, res);
	spin_unlock(&cur_files->file_lock);

	return err;
}
void do_close_on_exec(struct files_struct *files)
{
unsigned i;
......
......@@ -133,6 +133,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
extern int __close_fd_get_file(unsigned int fd, struct file **res);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
......
......@@ -64,9 +64,7 @@ struct io_worker {
#endif
const struct cred *cur_creds;
const struct cred *saved_creds;
struct files_struct *restore_files;
struct nsproxy *restore_nsproxy;
struct fs_struct *restore_fs;
};
#if BITS_PER_LONG == 64
......@@ -156,19 +154,19 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
worker->cur_creds = worker->saved_creds = NULL;
}
if (current->files != worker->restore_files) {
if (current->files) {
__acquire(&wqe->lock);
raw_spin_unlock_irq(&wqe->lock);
dropped_lock = true;
task_lock(current);
current->files = worker->restore_files;
current->files = NULL;
current->nsproxy = worker->restore_nsproxy;
task_unlock(current);
}
if (current->fs != worker->restore_fs)
current->fs = worker->restore_fs;
if (current->fs)
current->fs = NULL;
/*
* If we have an active mm, we need to drop the wq lock before unusing
......@@ -329,11 +327,11 @@ static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker)
allow_kernel_signal(SIGINT);
current->flags |= PF_IO_WORKER;
current->fs = NULL;
current->files = NULL;
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
worker->restore_files = current->files;
worker->restore_nsproxy = current->nsproxy;
worker->restore_fs = current->fs;
io_wqe_inc_running(wqe, worker);
}
......@@ -555,23 +553,21 @@ static void io_worker_handle_work(struct io_worker *worker)
/* handle a whole dependent link */
do {
struct io_wq_work *old_work, *next_hashed, *linked;
struct io_wq_work *next_hashed, *linked;
unsigned int hash = io_get_work_hash(work);
next_hashed = wq_next_work(work);
io_impersonate_work(worker, work);
wq->do_work(work);
io_assign_current_work(worker, NULL);
old_work = work;
linked = wq->do_work(work);
linked = wq->free_work(work);
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
linked = NULL;
}
io_assign_current_work(worker, work);
wq->free_work(old_work);
if (linked)
io_wqe_enqueue(wqe, linked);
......@@ -850,11 +846,9 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
struct io_wq *wq = wqe->wq;
do {
struct io_wq_work *old_work = work;
work->flags |= IO_WQ_WORK_CANCEL;
work = wq->do_work(work);
wq->free_work(old_work);
wq->do_work(work);
work = wq->free_work(work);
} while (work);
}
......@@ -944,7 +938,6 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
*/
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
match->fn(worker->cur_work, match->data)) {
send_sig(SIGINT, worker->task, 1);
match->nr_running++;
......
......@@ -9,7 +9,6 @@ enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HASHED = 2,
IO_WQ_WORK_UNBOUND = 4,
IO_WQ_WORK_NO_CANCEL = 8,
IO_WQ_WORK_CONCURRENT = 16,
IO_WQ_WORK_FILES = 32,
......@@ -28,15 +27,6 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};
struct io_wq_work_node {
struct io_wq_work_node *next;
};
struct io_wq_work_list {
struct io_wq_work_node *first;
struct io_wq_work_node *last;
};
static inline void wq_list_add_after(struct io_wq_work_node *node,
struct io_wq_work_node *pos,
struct io_wq_work_list *list)
......@@ -107,8 +97,8 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
return container_of(work->list.next, struct io_wq_work, list);
}
typedef void (free_work_fn)(struct io_wq_work *);
typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
typedef void (io_wq_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct user_struct *user;
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -20,7 +20,7 @@ static const char *proc_self_get_link(struct dentry *dentry,
* Not currently supported. Once we can inherit all of struct pid,
* we can allow this.
*/
if (current->flags & PF_KTHREAD)
if (current->flags & PF_IO_WORKER)
return ERR_PTR(-EOPNOTSUPP);
if (!tgid)
......
......@@ -17,6 +17,13 @@ static const char *proc_thread_self_get_link(struct dentry *dentry,
pid_t pid = task_pid_nr_ns(current, ns);
char *name;
/*
* Not currently supported. Once we can inherit all of struct pid,
* we can allow this.
*/
if (current->flags & PF_IO_WORKER)
return ERR_PTR(-EOPNOTSUPP);
if (!pid)
return ERR_PTR(-ENOENT);
name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
......
......@@ -22,6 +22,15 @@ struct io_identity {
refcount_t count;
};
/*
 * Link for a singly linked list: holds only the pointer to the next node.
 * Meant to be embedded in a larger object, which is recovered from the
 * node with container_of().
 */
struct io_wq_work_node {
struct io_wq_work_node *next;
};
/* List head that tracks both ends of a chain of io_wq_work_node links. */
struct io_wq_work_list {
struct io_wq_work_node *first;
struct io_wq_work_node *last;
};
struct io_uring_task {
/* submission side */
struct xarray xa;
......@@ -32,6 +41,11 @@ struct io_uring_task {
struct io_identity *identity;
atomic_t in_idle;
bool sqpoll;
spinlock_t task_lock;
struct io_wq_work_list task_list;
unsigned long task_state;
struct callback_head task_work;
};
#if defined(CONFIG_IO_URING)
......
......@@ -285,12 +285,22 @@ enum {
IORING_REGISTER_LAST
};
/* deprecated, see struct io_uring_rsrc_update */
struct io_uring_files_update {
__u32 offset;
__u32 resv;
__aligned_u64 /* __s32 * */ fds;
};
/*
 * Update descriptor with the same field layout as struct
 * io_uring_files_update, but with an untyped 64-bit payload in place of fds.
 * NOTE(review): presumably data carries a user pointer, as fds does in the
 * files variant - confirm against the register code.
 */
struct io_uring_rsrc_update {
__u32 offset;
__u32 resv;
__aligned_u64 data;
};
/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2)
#define IO_URING_OP_SUPPORTED (1U << 0)
struct io_uring_probe_op {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment