Commit c9a925b7 authored by Linus Torvalds

Merge tag 'io_uring-6.7-2023-11-30' of git://git.kernel.dk/linux

Pull io_uring fixes from Jens Axboe:

 - Fix an issue with discontig page checking for IORING_SETUP_NO_MMAP

 - Fix IORING_SETUP_NO_MMAP needlessly also disallowing mmap'ed buffer
   rings (see the first sketch below)

 - Fix an issue with deferred release of memory mapped pages

 - Fix a lockdep issue with IORING_SETUP_NO_MMAP

 - Use fget/fput consistently, even from our sync system calls. No real
   issue here, but if we were ever to allow closing io_uring descriptors
   it would be required. Let's play it safe and just use the full
   refcounted versions upfront. Most uses of io_uring are threaded anyway,
   and hence already doing the full version underneath (see the second
   sketch after this list).
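
A first sketch, illustrating the buffer ring path these fixes touch: a
minimal userspace example (not part of this merge; map_pbuf_ring(),
ring_fd, entries and bgid are illustrative names, error handling
trimmed) that registers a kernel-allocated provided buffer ring with
IOU_PBUF_RING_MMAP and then mmap()s it via the IORING_OFF_PBUF_RING
offset:

  #include <linux/io_uring.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Register a kernel-allocated pbuf ring and mmap() it into userspace */
  static struct io_uring_buf_ring *map_pbuf_ring(int ring_fd,
                                                 unsigned int entries,
                                                 unsigned int bgid)
  {
      struct io_uring_buf_reg reg;
      void *ring;

      memset(&reg, 0, sizeof(reg));
      reg.ring_entries = entries;
      reg.bgid = bgid;
      reg.flags = IOU_PBUF_RING_MMAP;    /* kernel allocates the ring memory */

      if (syscall(__NR_io_uring_register, ring_fd,
                  IORING_REGISTER_PBUF_RING, &reg, 1) < 0)
          return NULL;

      /* the mmap offset encodes the buffer group ID */
      ring = mmap(NULL, entries * sizeof(struct io_uring_buf),
                  PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
                  ring_fd, IORING_OFF_PBUF_RING |
                           ((__u64)bgid << IORING_OFF_PBUF_SHIFT));
      return ring == MAP_FAILED ? NULL : ring;
  }

With the fixes below, this mmap() also works when the SQ/CQ rings were
set up with IORING_SETUP_NO_MMAP, the buffer group lookup runs under
rcu_read_lock() rather than ->uring_lock, and the ring memory is reused
or freed only at final release.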
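
A second sketch, showing the fget()/fput() pattern the last bullet
describes, in hedged kernel-style C (example_use_fd() and
do_something_with() are placeholders, not helpers from this merge):

  #include <linux/errno.h>
  #include <linux/file.h>

  static int do_something_with(struct file *file);    /* placeholder */

  static int example_use_fd(int fd)
  {
      struct file *file;
      int ret;

      file = fget(fd);    /* always takes a full file reference */
      if (!file)
          return -EBADF;

      /* the fd table may change underneath us, the file cannot go away */
      ret = do_something_with(file);

      fput(file);         /* pairs with fget(), also on error paths */
      return ret;
  }

fdget()/fdput() can skip the reference count when the file table is not
shared; fget()/fput() always take and drop one, which is what the sync
paths below are switched to.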

* tag 'io_uring-6.7-2023-11-30' of git://git.kernel.dk/linux:
  io_uring: use fget/fput consistently
  io_uring: free io_buffer_list entries via RCU
  io_uring/kbuf: prune deferred locked cache when tearing down
  io_uring/kbuf: recycle freed mapped buffer ring entries
  io_uring/kbuf: defer release of mapped buffer rings
  io_uring: enable io_mem_alloc/free to be used in other parts
  io_uring: don't guard IORING_OFF_PBUF_RING with SETUP_NO_MMAP
  io_uring: don't allow discontig pages for IORING_SETUP_NO_MMAP
parents ee0c8a9b 73363c26
@@ -340,6 +340,9 @@ struct io_ring_ctx {
 	struct list_head	io_buffers_cache;
+	/* deferred free list, protected by ->uring_lock */
+	struct hlist_head	io_buf_list;
 	/* Keep this last, we don't need it for the fast path */
 	struct wait_queue_head	poll_wq;
 	struct io_restriction	restrictions;
...
@@ -273,7 +273,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
 	};
 	ktime_t timeout = KTIME_MAX;
 	struct io_uring_sync_cancel_reg sc;
-	struct fd f = { };
+	struct file *file = NULL;
 	DEFINE_WAIT(wait);
 	int ret, i;
@@ -295,10 +295,10 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
 	/* we can grab a normal file descriptor upfront */
 	if ((cd.flags & IORING_ASYNC_CANCEL_FD) &&
 	   !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
-		f = fdget(sc.fd);
-		if (!f.file)
+		file = fget(sc.fd);
+		if (!file)
 			return -EBADF;
-		cd.file = f.file;
+		cd.file = file;
 	}
 	ret = __io_sync_cancel(current->io_uring, &cd, sc.fd);
@@ -348,6 +348,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg)
 	if (ret == -ENOENT || ret > 0)
 		ret = 0;
 out:
-	fdput(f);
+	if (file)
+		fput(file);
 	return ret;
 }
@@ -325,6 +325,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
+	INIT_HLIST_HEAD(&ctx->io_buf_list);
 	io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
			    sizeof(struct io_rsrc_node));
 	io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX,
@@ -2666,7 +2667,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
 }
-static void io_mem_free(void *ptr)
+void io_mem_free(void *ptr)
 {
 	if (!ptr)
 		return;
@@ -2697,6 +2698,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 {
 	struct page **page_array;
 	unsigned int nr_pages;
+	void *page_addr;
 	int ret, i;
 	*npages = 0;
@@ -2718,27 +2720,29 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 		io_pages_free(&page_array, ret > 0 ? ret : 0);
 		return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT);
 	}
-	/*
-	 * Should be a single page. If the ring is small enough that we can
-	 * use a normal page, that is fine. If we need multiple pages, then
-	 * userspace should use a huge page. That's the only way to guarantee
-	 * that we get contigious memory, outside of just being lucky or
-	 * (currently) having low memory fragmentation.
-	 */
-	if (page_array[0] != page_array[ret - 1])
-		goto err;
-	/*
-	 * Can't support mapping user allocated ring memory on 32-bit archs
-	 * where it could potentially reside in highmem. Just fail those with
-	 * -EINVAL, just like we did on kernels that didn't support this
-	 * feature.
-	 */
+	page_addr = page_address(page_array[0]);
 	for (i = 0; i < nr_pages; i++) {
-		if (PageHighMem(page_array[i])) {
-			ret = -EINVAL;
+		ret = -EINVAL;
+
+		/*
+		 * Can't support mapping user allocated ring memory on 32-bit
+		 * archs where it could potentially reside in highmem. Just
+		 * fail those with -EINVAL, just like we did on kernels that
+		 * didn't support this feature.
+		 */
+		if (PageHighMem(page_array[i]))
 			goto err;
-		}
+
+		/*
+		 * No support for discontig pages for now, should either be a
+		 * single normal page, or a huge page. Later on we can add
+		 * support for remapping discontig pages, for now we will
+		 * just fail them with EINVAL.
+		 */
+		if (page_address(page_array[i]) != page_addr)
+			goto err;
+		page_addr += PAGE_SIZE;
 	}
 	*pages = page_array;
@@ -2775,7 +2779,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
 	}
 }
-static void *io_mem_alloc(size_t size)
+void *io_mem_alloc(size_t size)
 {
 	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
 	void *ret;
@@ -2947,6 +2951,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		ctx->mm_account = NULL;
 	}
 	io_rings_free(ctx);
+	io_kbuf_mmap_list_free(ctx);
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
@@ -3475,25 +3480,27 @@ static void *io_uring_validate_mmap_request(struct file *file,
 	struct page *page;
 	void *ptr;
-	/* Don't allow mmap if the ring was setup without it */
-	if (ctx->flags & IORING_SETUP_NO_MMAP)
-		return ERR_PTR(-EINVAL);
 	switch (offset & IORING_OFF_MMAP_MASK) {
 	case IORING_OFF_SQ_RING:
 	case IORING_OFF_CQ_RING:
+		/* Don't allow mmap if the ring was setup without it */
+		if (ctx->flags & IORING_SETUP_NO_MMAP)
+			return ERR_PTR(-EINVAL);
 		ptr = ctx->rings;
 		break;
 	case IORING_OFF_SQES:
+		/* Don't allow mmap if the ring was setup without it */
+		if (ctx->flags & IORING_SETUP_NO_MMAP)
+			return ERR_PTR(-EINVAL);
 		ptr = ctx->sq_sqes;
 		break;
 	case IORING_OFF_PBUF_RING: {
 		unsigned int bgid;
 		bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
-		mutex_lock(&ctx->uring_lock);
+		rcu_read_lock();
 		ptr = io_pbuf_get_address(ctx, bgid);
-		mutex_unlock(&ctx->uring_lock);
+		rcu_read_unlock();
 		if (!ptr)
 			return ERR_PTR(-EINVAL);
 		break;
@@ -3645,7 +3652,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		size_t, argsz)
 {
 	struct io_ring_ctx *ctx;
-	struct fd f;
+	struct file *file;
 	long ret;
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
@@ -3663,20 +3670,19 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
 			return -EINVAL;
 		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
-		f.file = tctx->registered_rings[fd];
-		f.flags = 0;
-		if (unlikely(!f.file))
+		file = tctx->registered_rings[fd];
+		if (unlikely(!file))
 			return -EBADF;
 	} else {
-		f = fdget(fd);
-		if (unlikely(!f.file))
+		file = fget(fd);
+		if (unlikely(!file))
 			return -EBADF;
 		ret = -EOPNOTSUPP;
-		if (unlikely(!io_is_uring_fops(f.file)))
+		if (unlikely(!io_is_uring_fops(file)))
 			goto out;
 	}
-	ctx = f.file->private_data;
+	ctx = file->private_data;
 	ret = -EBADFD;
 	if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED))
 		goto out;
@@ -3770,7 +3776,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		}
 	}
 out:
-	fdput(f);
+	if (!(flags & IORING_ENTER_REGISTERED_RING))
+		fput(file);
 	return ret;
 }
@@ -4611,7 +4618,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 {
 	struct io_ring_ctx *ctx;
 	long ret = -EBADF;
-	struct fd f;
+	struct file *file;
 	bool use_registered_ring;
 	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
@@ -4630,27 +4637,27 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
 			return -EINVAL;
 		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
-		f.file = tctx->registered_rings[fd];
-		f.flags = 0;
-		if (unlikely(!f.file))
+		file = tctx->registered_rings[fd];
+		if (unlikely(!file))
 			return -EBADF;
 	} else {
-		f = fdget(fd);
-		if (unlikely(!f.file))
+		file = fget(fd);
+		if (unlikely(!file))
 			return -EBADF;
 		ret = -EOPNOTSUPP;
-		if (!io_is_uring_fops(f.file))
+		if (!io_is_uring_fops(file))
 			goto out_fput;
 	}
-	ctx = f.file->private_data;
+	ctx = file->private_data;
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
 	mutex_unlock(&ctx->uring_lock);
 	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
 out_fput:
-	fdput(f);
+	if (!use_registered_ring)
+		fput(file);
 	return ret;
 }
...
@@ -86,6 +86,9 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			bool cancel_all);
+void *io_mem_alloc(size_t size);
+void io_mem_free(void *ptr);
 #if defined(CONFIG_PROVE_LOCKING)
 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 {
...
@@ -33,19 +33,42 @@ struct io_provide_buf {
 	__u16 bid;
 };
+struct io_buf_free {
+	struct hlist_node		list;
+	void				*mem;
+	size_t				size;
+	int				inuse;
+};
+
+static struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx,
+						   struct io_buffer_list *bl,
+						   unsigned int bgid)
+{
+	if (bl && bgid < BGID_ARRAY)
+		return &bl[bgid];
+
+	return xa_load(&ctx->io_bl_xa, bgid);
+}
+
 static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx,
							unsigned int bgid)
 {
-	if (ctx->io_bl && bgid < BGID_ARRAY)
-		return &ctx->io_bl[bgid];
-	return xa_load(&ctx->io_bl_xa, bgid);
+	lockdep_assert_held(&ctx->uring_lock);
+
+	return __io_buffer_get_list(ctx, ctx->io_bl, bgid);
 }
 static int io_buffer_add_list(struct io_ring_ctx *ctx,
			      struct io_buffer_list *bl, unsigned int bgid)
 {
+	/*
+	 * Store buffer group ID and finally mark the list as visible.
+	 * The normal lookup doesn't care about the visibility as we're
+	 * always under the ->uring_lock, but the RCU lookup from mmap does.
+	 */
 	bl->bgid = bgid;
+	smp_store_release(&bl->is_ready, 1);
 	if (bgid < BGID_ARRAY)
 		return 0;
@@ -196,21 +219,40 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
 static __cold int io_init_bl_list(struct io_ring_ctx *ctx)
 {
+	struct io_buffer_list *bl;
 	int i;
-	ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list),
-				GFP_KERNEL);
-	if (!ctx->io_bl)
+	bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), GFP_KERNEL);
+	if (!bl)
 		return -ENOMEM;
 	for (i = 0; i < BGID_ARRAY; i++) {
-		INIT_LIST_HEAD(&ctx->io_bl[i].buf_list);
-		ctx->io_bl[i].bgid = i;
+		INIT_LIST_HEAD(&bl[i].buf_list);
+		bl[i].bgid = i;
 	}
+
+	smp_store_release(&ctx->io_bl, bl);
 	return 0;
 }
+/*
+ * Mark the given mapped range as free for reuse
+ */
+static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+{
+	struct io_buf_free *ibf;
+
+	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
+		if (bl->buf_ring == ibf->mem) {
+			ibf->inuse = 0;
+			return;
+		}
+	}
+
+	/* can't happen... */
+	WARN_ON_ONCE(1);
+}
+
 static int __io_remove_buffers(struct io_ring_ctx *ctx,
			       struct io_buffer_list *bl, unsigned nbufs)
 {
@@ -223,7 +265,11 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 	if (bl->is_mapped) {
 		i = bl->buf_ring->tail - bl->head;
 		if (bl->is_mmap) {
-			folio_put(virt_to_folio(bl->buf_ring));
+			/*
+			 * io_kbuf_list_free() will free the page(s) at
+			 * ->release() time.
+			 */
+			io_kbuf_mark_free(ctx, bl);
 			bl->buf_ring = NULL;
 			bl->is_mmap = 0;
 		} else if (bl->buf_nr_pages) {
@@ -274,9 +320,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
 	xa_for_each(&ctx->io_bl_xa, index, bl) {
 		xa_erase(&ctx->io_bl_xa, bl->bgid);
 		__io_remove_buffers(ctx, bl, -1U);
-		kfree(bl);
+		kfree_rcu(bl, rcu);
 	}
+	/*
+	 * Move deferred locked entries to cache before pruning
+	 */
+	spin_lock(&ctx->completion_lock);
+	if (!list_empty(&ctx->io_buffers_comp))
+		list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache);
+	spin_unlock(&ctx->completion_lock);
+
 	list_for_each_safe(item, tmp, &ctx->io_buffers_cache) {
 		buf = list_entry(item, struct io_buffer, list);
 		kmem_cache_free(io_buf_cachep, buf);
@@ -460,7 +514,16 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 		INIT_LIST_HEAD(&bl->buf_list);
 		ret = io_buffer_add_list(ctx, bl, p->bgid);
 		if (ret) {
-			kfree(bl);
+			/*
+			 * Doesn't need rcu free as it was never visible, but
+			 * let's keep it consistent throughout. Also can't
+			 * be a lower indexed array group, as adding one
+			 * where lookup failed cannot happen.
+			 */
+			if (p->bgid >= BGID_ARRAY)
+				kfree_rcu(bl, rcu);
+			else
+				WARN_ON_ONCE(1);
 			goto err;
 		}
 	}
@@ -531,19 +594,63 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	return -EINVAL;
 }
-static int io_alloc_pbuf_ring(struct io_uring_buf_reg *reg,
+/*
+ * See if we have a suitable region that we can reuse, rather than allocate
+ * both a new io_buf_free and mem region again. We leave it on the list as
+ * even a reused entry will need freeing at ring release.
+ */
+static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx,
+						    size_t ring_size)
+{
+	struct io_buf_free *ibf, *best = NULL;
+	size_t best_dist;
+
+	hlist_for_each_entry(ibf, &ctx->io_buf_list, list) {
+		size_t dist;
+
+		if (ibf->inuse || ibf->size < ring_size)
+			continue;
+		dist = ibf->size - ring_size;
+		if (!best || dist < best_dist) {
+			best = ibf;
+			if (!dist)
+				break;
+			best_dist = dist;
+		}
+	}
+
+	return best;
+}
+
+static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
+			      struct io_uring_buf_reg *reg,
			      struct io_buffer_list *bl)
 {
-	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
+	struct io_buf_free *ibf;
 	size_t ring_size;
 	void *ptr;
 	ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
-	ptr = (void *) __get_free_pages(gfp, get_order(ring_size));
-	if (!ptr)
-		return -ENOMEM;
-	bl->buf_ring = ptr;
+
+	/* Reuse existing entry, if we can */
+	ibf = io_lookup_buf_free_entry(ctx, ring_size);
+	if (!ibf) {
+		ptr = io_mem_alloc(ring_size);
+		if (!ptr)
+			return -ENOMEM;
+
+		/* Allocate and store deferred free entry */
+		ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT);
+		if (!ibf) {
+			io_mem_free(ptr);
+			return -ENOMEM;
+		}
+		ibf->mem = ptr;
+		ibf->size = ring_size;
+		hlist_add_head(&ibf->list, &ctx->io_buf_list);
+	}
+	ibf->inuse = 1;
+	bl->buf_ring = ibf->mem;
 	bl->is_mapped = 1;
 	bl->is_mmap = 1;
 	return 0;
@@ -555,6 +662,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	struct io_buffer_list *bl, *free_bl = NULL;
 	int ret;
+	lockdep_assert_held(&ctx->uring_lock);
+
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
@@ -599,7 +708,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	if (!(reg.flags & IOU_PBUF_RING_MMAP))
 		ret = io_pin_pbuf_ring(&reg, bl);
 	else
-		ret = io_alloc_pbuf_ring(&reg, bl);
+		ret = io_alloc_pbuf_ring(ctx, &reg, bl);
 	if (!ret) {
 		bl->nr_entries = reg.ring_entries;
@@ -609,7 +718,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 		return 0;
 	}
-	kfree(free_bl);
+	kfree_rcu(free_bl, rcu);
 	return ret;
 }
@@ -618,6 +727,8 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	struct io_uring_buf_reg reg;
 	struct io_buffer_list *bl;
+	lockdep_assert_held(&ctx->uring_lock);
+
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
 	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
@@ -634,7 +745,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	__io_remove_buffers(ctx, bl, -1U);
 	if (bl->bgid >= BGID_ARRAY) {
 		xa_erase(&ctx->io_bl_xa, bl->bgid);
-		kfree(bl);
+		kfree_rcu(bl, rcu);
 	}
 	return 0;
 }
@@ -643,9 +754,33 @@ void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid)
 {
 	struct io_buffer_list *bl;
-	bl = io_buffer_get_list(ctx, bgid);
+	bl = __io_buffer_get_list(ctx, smp_load_acquire(&ctx->io_bl), bgid);
 	if (!bl || !bl->is_mmap)
 		return NULL;
+	/*
+	 * Ensure the list is fully setup. Only strictly needed for RCU lookup
+	 * via mmap, and in that case only for the array indexed groups. For
+	 * the xarray lookups, it's either visible and ready, or not at all.
+	 */
+	if (!smp_load_acquire(&bl->is_ready))
+		return NULL;
 	return bl->buf_ring;
 }
+
+/*
+ * Called at or after ->release(), free the mmap'ed buffers that we used
+ * for memory mapped provided buffer rings.
+ */
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx)
+{
+	struct io_buf_free *ibf;
+	struct hlist_node *tmp;
+
+	hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) {
+		hlist_del(&ibf->list);
+		io_mem_free(ibf->mem);
+		kfree(ibf);
+	}
+}
@@ -15,6 +15,7 @@ struct io_buffer_list {
			struct page **buf_pages;
			struct io_uring_buf_ring *buf_ring;
		};
+		struct rcu_head rcu;
	};
	__u16 bgid;
@@ -28,6 +29,8 @@ struct io_buffer_list {
	__u8 is_mapped;
	/* ring mapped provided buffers, but mmap'ed by application */
	__u8 is_mmap;
+	/* bl is visible from an RCU point of view for lookup */
+	__u8 is_ready;
 };
 struct io_buffer {
@@ -51,6 +54,8 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags);
 int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
+void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
+
 unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
...