Commit acf913b7 authored by Linus Torvalds

Merge tag 'for-linus-2019-10-26' of git://git.kernel.dk/linux-block

Pull block and io_uring fixes from Jens Axboe:
 "A bit bigger than usual at this point in time, mostly due to some good
  bug hunting work by Pavel that resulted in three io_uring fixes from
  him and two from me. Anyway, this pull request contains:

   - Revert of the submit-and-wait optimization for io_uring, it can't
     always be done safely. It depends on commands always making
     progress on their own, which isn't necessarily the case outside of
     strict file IO. (me)

   - Series of two patches from me and three from Pavel, fixing issues
     with shared data and sequencing for io_uring.

   - Lastly, two timeout sequence fixes for io_uring (zhangyi)

   - Two nbd patches fixing races (Josef)

   - libahci regulator_get_optional() fix (Mark)"

* tag 'for-linus-2019-10-26' of git://git.kernel.dk/linux-block:
  nbd: verify socket is supported during setup
  ata: libahci_platform: Fix regulator_get_optional() misuse
  nbd: handle racing with error'ed out commands
  nbd: protect cmd->status with cmd->lock
  io_uring: fix bad inflight accounting for SETUP_IOPOLL|SETUP_SQTHREAD
  io_uring: used cached copies of sq->dropped and cq->overflow
  io_uring: Fix race for sqes with userspace
  io_uring: Fix broken links with offloading
  io_uring: Fix corrupted user_data
  io_uring: correct timeout req sequence when inserting a new entry
  io_uring : correct timeout req sequence when waiting timeout
  io_uring: revert "io_uring: optimize submit_and_wait API"
parents f877bee5 cf1b2326
@@ -153,17 +153,13 @@ int ahci_platform_enable_regulators(struct ahci_host_priv *hpriv)
 {
         int rc, i;
 
-        if (hpriv->ahci_regulator) {
-                rc = regulator_enable(hpriv->ahci_regulator);
-                if (rc)
-                        return rc;
-        }
+        rc = regulator_enable(hpriv->ahci_regulator);
+        if (rc)
+                return rc;
 
-        if (hpriv->phy_regulator) {
-                rc = regulator_enable(hpriv->phy_regulator);
-                if (rc)
-                        goto disable_ahci_pwrs;
-        }
+        rc = regulator_enable(hpriv->phy_regulator);
+        if (rc)
+                goto disable_ahci_pwrs;
 
         for (i = 0; i < hpriv->nports; i++) {
                 if (!hpriv->target_pwrs[i])
@@ -181,11 +177,9 @@ int ahci_platform_enable_regulators(struct ahci_host_priv *hpriv)
                 if (hpriv->target_pwrs[i])
                         regulator_disable(hpriv->target_pwrs[i]);
 
-        if (hpriv->phy_regulator)
-                regulator_disable(hpriv->phy_regulator);
+        regulator_disable(hpriv->phy_regulator);
 disable_ahci_pwrs:
-        if (hpriv->ahci_regulator)
-                regulator_disable(hpriv->ahci_regulator);
+        regulator_disable(hpriv->ahci_regulator);
         return rc;
 }
 EXPORT_SYMBOL_GPL(ahci_platform_enable_regulators);
@@ -207,10 +201,8 @@ void ahci_platform_disable_regulators(struct ahci_host_priv *hpriv)
                 regulator_disable(hpriv->target_pwrs[i]);
         }
 
-        if (hpriv->ahci_regulator)
-                regulator_disable(hpriv->ahci_regulator);
-        if (hpriv->phy_regulator)
-                regulator_disable(hpriv->phy_regulator);
+        regulator_disable(hpriv->ahci_regulator);
+        regulator_disable(hpriv->phy_regulator);
 }
 EXPORT_SYMBOL_GPL(ahci_platform_disable_regulators);
 
 /**
@@ -359,7 +351,7 @@ static int ahci_platform_get_regulator(struct ahci_host_priv *hpriv, u32 port,
         struct regulator *target_pwr;
         int rc = 0;
 
-        target_pwr = regulator_get_optional(dev, "target");
+        target_pwr = regulator_get(dev, "target");
 
         if (!IS_ERR(target_pwr))
                 hpriv->target_pwrs[port] = target_pwr;
@@ -436,16 +428,14 @@ struct ahci_host_priv *ahci_platform_get_resources(struct platform_device *pdev,
                 hpriv->clks[i] = clk;
         }
 
-        hpriv->ahci_regulator = devm_regulator_get_optional(dev, "ahci");
+        hpriv->ahci_regulator = devm_regulator_get(dev, "ahci");
         if (IS_ERR(hpriv->ahci_regulator)) {
                 rc = PTR_ERR(hpriv->ahci_regulator);
-                if (rc == -EPROBE_DEFER)
+                if (rc != 0)
                         goto err_out;
-                rc = 0;
-                hpriv->ahci_regulator = NULL;
         }
 
-        hpriv->phy_regulator = devm_regulator_get_optional(dev, "phy");
+        hpriv->phy_regulator = devm_regulator_get(dev, "phy");
         if (IS_ERR(hpriv->phy_regulator)) {
                 rc = PTR_ERR(hpriv->phy_regulator);
                 if (rc == -EPROBE_DEFER)
...
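The libahci_platform change above relies on the regulator core's behaviour for plain regulator_get(): when no supply is described for the device, the core hands back a dummy regulator instead of an error, so regulator_enable()/regulator_disable() can be called unconditionally and the NULL checks go away. regulator_get_optional() is reserved for supplies that may genuinely be absent and whose absence changes how the driver programs the hardware. A minimal sketch of the resulting consumer pattern; the driver structure and the "myvcc" supply name are made up purely for illustration:

/* Illustrative only: "myvcc" and struct mydrv_priv are hypothetical. */
#include <linux/device.h>
#include <linux/err.h>
#include <linux/regulator/consumer.h>

struct mydrv_priv {
        struct regulator *vcc;
};

static int mydrv_enable_power(struct device *dev, struct mydrv_priv *priv)
{
        /*
         * devm_regulator_get() returns a dummy regulator when the firmware
         * does not describe a "myvcc" supply, so no NULL check is needed;
         * real errors (including -EPROBE_DEFER) come back via IS_ERR().
         */
        priv->vcc = devm_regulator_get(dev, "myvcc");
        if (IS_ERR(priv->vcc))
                return PTR_ERR(priv->vcc);

        /* Safe to call unconditionally, even on the dummy regulator. */
        return regulator_enable(priv->vcc);
}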
@@ -385,17 +385,16 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
         struct nbd_device *nbd = cmd->nbd;
         struct nbd_config *config;
 
+        if (!mutex_trylock(&cmd->lock))
+                return BLK_EH_RESET_TIMER;
+
         if (!refcount_inc_not_zero(&nbd->config_refs)) {
                 cmd->status = BLK_STS_TIMEOUT;
+                mutex_unlock(&cmd->lock);
                 goto done;
         }
         config = nbd->config;
 
-        if (!mutex_trylock(&cmd->lock)) {
-                nbd_config_put(nbd);
-                return BLK_EH_RESET_TIMER;
-        }
-
         if (config->num_connections > 1) {
                 dev_err_ratelimited(nbd_to_dev(nbd),
                                     "Connection timed out, retrying (%d/%d alive)\n",
@@ -711,6 +710,12 @@ static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
                 ret = -ENOENT;
                 goto out;
         }
+        if (cmd->status != BLK_STS_OK) {
+                dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n",
+                        req);
+                ret = -ENOENT;
+                goto out;
+        }
         if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
                 dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
                         req);
@@ -792,7 +797,10 @@ static bool nbd_clear_req(struct request *req, void *data, bool reserved)
 {
         struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 
+        mutex_lock(&cmd->lock);
         cmd->status = BLK_STS_IOERR;
+        mutex_unlock(&cmd->lock);
+
         blk_mq_complete_request(req);
         return true;
 }
@@ -972,6 +980,25 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
         return ret;
 }
 
+static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
+                                     int *err)
+{
+        struct socket *sock;
+
+        *err = 0;
+        sock = sockfd_lookup(fd, err);
+        if (!sock)
+                return NULL;
+
+        if (sock->ops->shutdown == sock_no_shutdown) {
+                dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
+                *err = -EINVAL;
+                return NULL;
+        }
+
+        return sock;
+}
+
 static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
                           bool netlink)
 {
@@ -981,7 +1008,7 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
         struct nbd_sock *nsock;
         int err;
 
-        sock = sockfd_lookup(arg, &err);
+        sock = nbd_get_socket(nbd, arg, &err);
         if (!sock)
                 return err;
@@ -1033,7 +1060,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
         int i;
         int err;
 
-        sock = sockfd_lookup(arg, &err);
+        sock = nbd_get_socket(nbd, arg, &err);
         if (!sock)
                 return err;
...
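The new nbd_get_socket() helper above rejects, at socket-attach time, any socket whose protocol has no real shutdown() implementation (sock_no_shutdown), since nbd shuts the socket down to unstick its receive path on timeouts and disconnects. For context, a rough userspace sketch of the legacy ioctl attach path that this check now guards; the NBD protocol handshake is omitted for brevity, and the address, port (10809 is the conventional NBD port) and device node are assumptions:

/* Illustrative sketch only: assumes an NBD server on 127.0.0.1:10809,
 * a free /dev/nbd0, and that negotiation has already been handled. */
#include <arpa/inet.h>
#include <fcntl.h>
#include <linux/nbd.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
        int sock = socket(AF_INET, SOCK_STREAM, 0);
        struct sockaddr_in srv = {
                .sin_family = AF_INET,
                .sin_port = htons(10809),
                .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
        };

        if (sock < 0 || connect(sock, (struct sockaddr *)&srv, sizeof(srv)) < 0) {
                perror("connect");
                return 1;
        }

        int nbd = open("/dev/nbd0", O_RDWR);
        if (nbd < 0) {
                perror("open /dev/nbd0");
                return 1;
        }

        /*
         * With the fix above, a socket type lacking a real shutdown()
         * implementation makes this ioctl fail with EINVAL at setup time
         * instead of leaving the device in a state it cannot tear down.
         */
        if (ioctl(nbd, NBD_SET_SOCK, sock) < 0) {
                perror("NBD_SET_SOCK");
                return 1;
        }

        close(nbd);
        close(sock);
        return 0;
}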
@@ -197,6 +197,7 @@ struct io_ring_ctx {
                 unsigned                sq_entries;
                 unsigned                sq_mask;
                 unsigned                sq_thread_idle;
+                unsigned                cached_sq_dropped;
                 struct io_uring_sqe     *sq_sqes;
 
                 struct list_head        defer_list;
@@ -212,6 +213,7 @@ struct io_ring_ctx {
         struct {
                 unsigned                cached_cq_tail;
+                atomic_t                cached_cq_overflow;
                 unsigned                cq_entries;
                 unsigned                cq_mask;
                 struct wait_queue_head  cq_wait;
@@ -420,7 +422,8 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 static inline bool __io_sequence_defer(struct io_ring_ctx *ctx,
                                        struct io_kiocb *req)
 {
-        return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped;
+        return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped
+                        + atomic_read(&ctx->cached_cq_overflow);
 }
 
 static inline bool io_sequence_defer(struct io_ring_ctx *ctx,
@@ -567,9 +570,8 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data,
                 WRITE_ONCE(cqe->res, res);
                 WRITE_ONCE(cqe->flags, 0);
         } else {
-                unsigned overflow = READ_ONCE(ctx->rings->cq_overflow);
-
-                WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1);
+                WRITE_ONCE(ctx->rings->cq_overflow,
+                           atomic_inc_return(&ctx->cached_cq_overflow));
         }
 }
@@ -735,6 +737,14 @@ static unsigned io_cqring_events(struct io_rings *rings)
         return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
 }
 
+static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
+{
+        struct io_rings *rings = ctx->rings;
+
+        /* make sure SQ entry isn't read before tail */
+        return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
+}
+
 /*
  * Find and free completed poll iocbs
  */
@@ -864,19 +874,11 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
         mutex_unlock(&ctx->uring_lock);
 }
 
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
                            long min)
 {
-        int iters, ret = 0;
-
-        /*
-         * We disallow the app entering submit/complete with polling, but we
-         * still need to lock the ring to prevent racing with polled issue
-         * that got punted to a workqueue.
-         */
-        mutex_lock(&ctx->uring_lock);
-
-        iters = 0;
+        int iters = 0, ret = 0;
+
         do {
                 int tmin = 0;
@@ -912,6 +914,21 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
                 ret = 0;
         } while (min && !*nr_events && !need_resched());
 
+        return ret;
+}
+
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+                           long min)
+{
+        int ret;
+
+        /*
+         * We disallow the app entering submit/complete with polling, but we
+         * still need to lock the ring to prevent racing with polled issue
+         * that got punted to a workqueue.
+         */
+        mutex_lock(&ctx->uring_lock);
+        ret = __io_iopoll_check(ctx, nr_events, min);
         mutex_unlock(&ctx->uring_lock);
         return ret;
 }
@@ -1877,7 +1894,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 {
         struct io_ring_ctx *ctx;
-        struct io_kiocb *req;
+        struct io_kiocb *req, *prev;
         unsigned long flags;
 
         req = container_of(timer, struct io_kiocb, timeout.timer);
@@ -1885,6 +1902,15 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
         atomic_inc(&ctx->cq_timeouts);
 
         spin_lock_irqsave(&ctx->completion_lock, flags);
+        /*
+         * Adjust the reqs sequence before the current one because it
+         * will consume a slot in the cq_ring and the the cq_tail pointer
+         * will be increased, otherwise other timeout reqs may return in
+         * advance without waiting for enough wait_nr.
+         */
+        prev = req;
+        list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list)
+                prev->sequence++;
         list_del(&req->list);
 
         io_cqring_fill_event(ctx, req->user_data, -ETIME);
@@ -1903,6 +1929,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
         struct io_ring_ctx *ctx = req->ctx;
         struct list_head *entry;
         struct timespec64 ts;
+        unsigned span = 0;
 
         if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
                 return -EINVAL;
@@ -1951,9 +1978,17 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
                 if (ctx->cached_sq_head < nxt_sq_head)
                         tmp += UINT_MAX;
 
-                if (tmp >= tmp_nxt)
+                if (tmp > tmp_nxt)
                         break;
+
+                /*
+                 * Sequence of reqs after the insert one and itself should
+                 * be adjusted because each timeout req consumes a slot.
+                 */
+                span++;
+                nxt->sequence++;
         }
+        req->sequence -= span;
         list_add(&req->list, entry);
         spin_unlock_irq(&ctx->completion_lock);
@@ -2292,11 +2327,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 }
 
 static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-                          struct sqe_submit *s, bool force_nonblock)
+                          struct sqe_submit *s)
 {
         int ret;
 
-        ret = __io_submit_sqe(ctx, req, s, force_nonblock);
+        ret = __io_submit_sqe(ctx, req, s, true);
 
         /*
          * We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2343,7 +2378,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 }
 
 static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
-                        struct sqe_submit *s, bool force_nonblock)
+                        struct sqe_submit *s)
 {
         int ret;
@@ -2356,18 +2391,17 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
                 return 0;
         }
 
-        return __io_queue_sqe(ctx, req, s, force_nonblock);
+        return __io_queue_sqe(ctx, req, s);
 }
 
 static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
-                              struct sqe_submit *s, struct io_kiocb *shadow,
-                              bool force_nonblock)
+                              struct sqe_submit *s, struct io_kiocb *shadow)
 {
         int ret;
         int need_submit = false;
 
         if (!shadow)
-                return io_queue_sqe(ctx, req, s, force_nonblock);
+                return io_queue_sqe(ctx, req, s);
 
         /*
          * Mark the first IO in link list as DRAIN, let all the following
@@ -2396,7 +2430,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
         spin_unlock_irq(&ctx->completion_lock);
 
         if (need_submit)
-                return __io_queue_sqe(ctx, req, s, force_nonblock);
+                return __io_queue_sqe(ctx, req, s);
 
         return 0;
 }
@@ -2404,8 +2438,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req,
 #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
 
 static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
-                          struct io_submit_state *state, struct io_kiocb **link,
-                          bool force_nonblock)
+                          struct io_submit_state *state, struct io_kiocb **link)
 {
         struct io_uring_sqe *sqe_copy;
         struct io_kiocb *req;
@@ -2432,6 +2465,8 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
                 return;
         }
 
+        req->user_data = s->sqe->user_data;
+
         /*
          * If we already have a head request, queue this one for async
          * submittal once the head completes. If we don't have a head but
@@ -2458,7 +2493,7 @@ static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
                 INIT_LIST_HEAD(&req->link_list);
                 *link = req;
         } else {
-                io_queue_sqe(ctx, req, s, force_nonblock);
+                io_queue_sqe(ctx, req, s);
         }
 }
@@ -2538,12 +2573,13 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s)
         /* drop invalid entries */
         ctx->cached_sq_head++;
-        rings->sq_dropped++;
+        ctx->cached_sq_dropped++;
+        WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
         return false;
 }
 
-static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
-                          unsigned int nr, bool has_user, bool mm_fault)
+static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
+                          bool has_user, bool mm_fault)
 {
         struct io_submit_state state, *statep = NULL;
         struct io_kiocb *link = NULL;
@@ -2557,19 +2593,23 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
         }
 
         for (i = 0; i < nr; i++) {
+                struct sqe_submit s;
+
+                if (!io_get_sqring(ctx, &s))
+                        break;
+
                 /*
                  * If previous wasn't linked and we have a linked command,
                  * that's the end of the chain. Submit the previous link.
                  */
                 if (!prev_was_link && link) {
-                        io_queue_link_head(ctx, link, &link->submit, shadow_req,
-                                                true);
+                        io_queue_link_head(ctx, link, &link->submit, shadow_req);
                         link = NULL;
                         shadow_req = NULL;
                 }
-                prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
+                prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
 
-                if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) {
+                if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) {
                         if (!shadow_req) {
                                 shadow_req = io_get_req(ctx, NULL);
                                 if (unlikely(!shadow_req))
@@ -2577,24 +2617,24 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
                                 shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN);
                                 refcount_dec(&shadow_req->refs);
                         }
-                        shadow_req->sequence = sqes[i].sequence;
+                        shadow_req->sequence = s.sequence;
                 }
 
out:
                 if (unlikely(mm_fault)) {
-                        io_cqring_add_event(ctx, sqes[i].sqe->user_data,
+                        io_cqring_add_event(ctx, s.sqe->user_data,
                                                 -EFAULT);
                 } else {
-                        sqes[i].has_user = has_user;
-                        sqes[i].needs_lock = true;
-                        sqes[i].needs_fixed_file = true;
-                        io_submit_sqe(ctx, &sqes[i], statep, &link, true);
+                        s.has_user = has_user;
+                        s.needs_lock = true;
+                        s.needs_fixed_file = true;
+                        io_submit_sqe(ctx, &s, statep, &link);
                         submitted++;
                 }
         }
 
         if (link)
-                io_queue_link_head(ctx, link, &link->submit, shadow_req, true);
+                io_queue_link_head(ctx, link, &link->submit, shadow_req);
         if (statep)
                 io_submit_state_end(&state);
@@ -2603,7 +2643,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
 
 static int io_sq_thread(void *data)
 {
-        struct sqe_submit sqes[IO_IOPOLL_BATCH];
         struct io_ring_ctx *ctx = data;
         struct mm_struct *cur_mm = NULL;
         mm_segment_t old_fs;
@@ -2618,14 +2657,27 @@ static int io_sq_thread(void *data)
         timeout = inflight = 0;
         while (!kthread_should_park()) {
-                bool all_fixed, mm_fault = false;
-                int i;
+                bool mm_fault = false;
+                unsigned int to_submit;
 
                 if (inflight) {
                         unsigned nr_events = 0;
 
                         if (ctx->flags & IORING_SETUP_IOPOLL) {
-                                io_iopoll_check(ctx, &nr_events, 0);
+                                /*
+                                 * inflight is the count of the maximum possible
+                                 * entries we submitted, but it can be smaller
+                                 * if we dropped some of them. If we don't have
+                                 * poll entries available, then we know that we
+                                 * have nothing left to poll for. Reset the
+                                 * inflight count to zero in that case.
+                                 */
+                                mutex_lock(&ctx->uring_lock);
+                                if (!list_empty(&ctx->poll_list))
+                                        __io_iopoll_check(ctx, &nr_events, 0);
+                                else
+                                        inflight = 0;
+                                mutex_unlock(&ctx->uring_lock);
                         } else {
                                 /*
                                  * Normal IO, just pretend everything completed.
@@ -2639,7 +2691,8 @@ static int io_sq_thread(void *data)
                         timeout = jiffies + ctx->sq_thread_idle;
                 }
 
-                if (!io_get_sqring(ctx, &sqes[0])) {
+                to_submit = io_sqring_entries(ctx);
+                if (!to_submit) {
                         /*
                          * We're polling. If we're within the defined idle
                          * period, then let us spin without work before going
@@ -2670,7 +2723,8 @@ static int io_sq_thread(void *data)
                         /* make sure to read SQ tail after writing flags */
                         smp_mb();
 
-                        if (!io_get_sqring(ctx, &sqes[0])) {
+                        to_submit = io_sqring_entries(ctx);
+                        if (!to_submit) {
                                 if (kthread_should_park()) {
                                         finish_wait(&ctx->sqo_wait, &wait);
                                         break;
@@ -2688,19 +2742,8 @@ static int io_sq_thread(void *data)
                         ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
                 }
 
-                i = 0;
-                all_fixed = true;
-                do {
-                        if (all_fixed && io_sqe_needs_user(sqes[i].sqe))
-                                all_fixed = false;
-
-                        i++;
-                        if (i == ARRAY_SIZE(sqes))
-                                break;
-                } while (io_get_sqring(ctx, &sqes[i]));
-
                 /* Unless all new commands are FIXED regions, grab mm */
-                if (!all_fixed && !cur_mm) {
+                if (!cur_mm) {
                         mm_fault = !mmget_not_zero(ctx->sqo_mm);
                         if (!mm_fault) {
                                 use_mm(ctx->sqo_mm);
@@ -2708,8 +2751,9 @@ static int io_sq_thread(void *data)
                         }
                 }
 
-                inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL,
-                                                mm_fault);
+                to_submit = min(to_submit, ctx->sq_entries);
+                inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL,
+                                           mm_fault);
 
                 /* Commit SQ ring head once we've consumed all SQEs */
                 io_commit_sqring(ctx);
@@ -2726,8 +2770,7 @@ static int io_sq_thread(void *data)
         return 0;
 }
 
-static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
-                          bool block_for_last)
+static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 {
         struct io_submit_state state, *statep = NULL;
         struct io_kiocb *link = NULL;
@@ -2741,7 +2784,6 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
         }
 
         for (i = 0; i < to_submit; i++) {
-                bool force_nonblock = true;
                 struct sqe_submit s;
 
                 if (!io_get_sqring(ctx, &s))
@@ -2752,8 +2794,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
                  * that's the end of the chain. Submit the previous link.
                  */
                 if (!prev_was_link && link) {
-                        io_queue_link_head(ctx, link, &link->submit, shadow_req,
-                                                force_nonblock);
+                        io_queue_link_head(ctx, link, &link->submit, shadow_req);
                         link = NULL;
                         shadow_req = NULL;
                 }
@@ -2775,27 +2816,16 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit,
                 s.needs_lock = false;
                 s.needs_fixed_file = false;
                 submit++;
-
-                /*
-                 * The caller will block for events after submit, submit the
-                 * last IO non-blocking. This is either the only IO it's
-                 * submitting, or it already submitted the previous ones. This
-                 * improves performance by avoiding an async punt that we don't
-                 * need to do.
-                 */
-                if (block_for_last && submit == to_submit)
-                        force_nonblock = false;
-
-                io_submit_sqe(ctx, &s, statep, &link, force_nonblock);
+                io_submit_sqe(ctx, &s, statep, &link);
         }
-        io_commit_sqring(ctx);
 
         if (link)
-                io_queue_link_head(ctx, link, &link->submit, shadow_req,
-                                        !block_for_last);
+                io_queue_link_head(ctx, link, &link->submit, shadow_req);
         if (statep)
                 io_submit_state_end(statep);
 
+        io_commit_sqring(ctx);
+
         return submit;
 }
@@ -3636,21 +3666,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
                         wake_up(&ctx->sqo_wait);
                 submitted = to_submit;
         } else if (to_submit) {
-                bool block_for_last = false;
-
                 to_submit = min(to_submit, ctx->sq_entries);
 
-                /*
-                 * Allow last submission to block in a series, IFF the caller
-                 * asked to wait for events and we don't currently have
-                 * enough. This potentially avoids an async punt.
-                 */
-                if (to_submit == min_complete &&
-                    io_cqring_events(ctx->rings) < min_complete)
-                        block_for_last = true;
-
                 mutex_lock(&ctx->uring_lock);
-                submitted = io_ring_submit(ctx, to_submit, block_for_last);
+                submitted = io_ring_submit(ctx, to_submit);
                 mutex_unlock(&ctx->uring_lock);
         }
         if (flags & IORING_ENTER_GETEVENTS) {
...
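Several of the io_uring changes above depend on the ordering contract between the SQ tail and the SQE contents: io_sqring_entries() reads the tail with smp_load_acquire(), which only pays off if the submitter published the tail with a release store after filling in the entries. A rough C11 sketch of that publish/consume pairing; the ring layout and helper names here are illustrative, not the real io_uring or liburing structures:

/* Illustrative memory-ordering sketch only; "sqe_slot", "ring" and the
 * helpers below are made-up names, not the io_uring/liburing API. */
#include <stdatomic.h>
#include <stdint.h>

struct sqe_slot { uint64_t user_data; /* ... more fields ... */ };

struct ring {
        _Atomic uint32_t tail;      /* written by the producer (userspace) */
        uint32_t head;              /* consumer-private cached head */
        uint32_t mask;              /* entries - 1; ring sizes are powers of two */
        struct sqe_slot slots[256];
};

/* Producer: fill the entry, then publish it with a release store. */
static void publish_sqe(struct ring *r, uint64_t user_data)
{
        uint32_t tail = atomic_load_explicit(&r->tail, memory_order_relaxed);

        r->slots[tail & r->mask].user_data = user_data;
        /* Release: the slot write above cannot be reordered past this. */
        atomic_store_explicit(&r->tail, tail + 1, memory_order_release);
}

/* Consumer: acquire-load the tail before touching any new entries,
 * mirroring what io_sqring_entries() does with smp_load_acquire(). */
static uint32_t pending_entries(struct ring *r)
{
        return atomic_load_explicit(&r->tail, memory_order_acquire) - r->head;
}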