Commit e6fc4649 authored by Ming Lei, committed by Jens Axboe

blk-mq: avoid starving tag allocation after allocating process migrates

When the allocating process is scheduled back in and its mapped hw queue
has changed, fake one extra wake up on the previous queue to compensate
for the missed wake up, so that other allocations on the previous queue
won't be starved.

This patch fixes a request allocation hang that is easily triggered
when nr_requests is very low.

The race is as follows:

1) 2 hw queues, nr_requests is 2, and wake_batch is 1

2) there are 3 waiters on hw queue 0

3) two in-flight requests in hw queue 0 are completed, and only two
   of the 3 waiters are woken up because of wake_batch; but both woken
   waiters may be scheduled to another CPU and end up switching to hw
   queue 1

4) the 3rd waiter then waits forever, since no in-flight request
   remains in hw queue 0

5) this patch fixes the hang by faking a wake up on the previous queue
   whenever a waiter is scheduled to another hw queue (see the timeline
   sketch below)
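
Concretely, the race plays out as below (W1..W3 are the waiters, R1/R2
the two in-flight requests; with a depth of 2, sbq_calc_wake_batch()
clamps depth / SBQ_WAIT_QUEUES up to a minimum of 1 in this era of
lib/sbitmap.c, which is where wake_batch == 1 comes from):

	hctx0: in-flight = {R1, R2}, waiters = {W1, W2, W3}

	complete(R1) -> sbitmap_queue_clear() -> wakes W1
	complete(R2) -> sbitmap_queue_clear() -> wakes W2
	W1 migrates to another CPU -> allocates from hctx1 instead
	W2 migrates to another CPU -> allocates from hctx1 instead
		-> no in-flight request is left on hctx0,
		   so nothing will ever wake W3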

Cc: <stable@vger.kernel.org>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>

Modified the commit message to make it clearer, and made it apply on
top of the 4.18 branch.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent f1834646
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -134,6 +134,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	ws = bt_wait_ptr(bt, data->hctx);
 	drop_ctx = data->ctx == NULL;
 	do {
+		struct sbitmap_queue *bt_prev;
+
 		/*
 		 * We're out of tags on this hardware queue, kick any
 		 * pending IO submits before going to sleep waiting for
@@ -159,6 +161,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (data->ctx)
 			blk_mq_put_ctx(data->ctx);
 
+		bt_prev = bt;
 		io_schedule();
 
 		data->ctx = blk_mq_get_ctx(data->q);
@@ -170,6 +173,15 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 			bt = &tags->bitmap_tags;
 
 		finish_wait(&ws->wait, &wait);
+
+		/*
+		 * If destination hw queue is changed, fake wake up on
+		 * previous queue for compensating the wake up miss, so
+		 * other allocations on previous queue won't be starved.
+		 */
+		if (bt != bt_prev)
+			sbitmap_queue_wake_up(bt_prev);
+
 		ws = bt_wait_ptr(bt, data->hctx);
 	} while (1);
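Taken together, the three hunks above make the waiter loop in
blk_mq_get_tag() read roughly as follows. This is a simplified sketch
reconstructed from the hunks plus surrounding context, not the full
function; the initial allocation retries, the reserved-tag branch, and
the ctx reference handling are elided:

	do {
		struct sbitmap_queue *bt_prev;

		prepare_to_wait_exclusive(&ws->wait, &wait,
					  TASK_UNINTERRUPTIBLE);

		tag = __blk_mq_get_tag(data, bt);
		if (tag != -1)
			break;

		/* remember which bitmap we are about to sleep on */
		bt_prev = bt;
		io_schedule();

		/* we may have been migrated; re-resolve ctx, hctx and bitmap */
		data->ctx = blk_mq_get_ctx(data->q);
		data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
		bt = &blk_mq_tags_from_data(data)->bitmap_tags;

		finish_wait(&ws->wait, &wait);

		/*
		 * Migrated to a different hw queue: hand the wakeup we
		 * consumed back to the queue we left, so its remaining
		 * waiters are not starved.
		 */
		if (bt != bt_prev)
			sbitmap_queue_wake_up(bt_prev);

		ws = bt_wait_ptr(bt, data->hctx);
	} while (1);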
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -512,6 +512,13 @@ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
  */
 void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);
 
+/**
+ * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue
+ * on a &struct sbitmap_queue.
+ * @sbq: Bitmap queue to wake up.
+ */
+void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
+
 /**
  * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
  * seq_file.
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -352,8 +352,9 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
 	if (sbq->wake_batch != wake_batch) {
 		WRITE_ONCE(sbq->wake_batch, wake_batch);
 		/*
-		 * Pairs with the memory barrier in sbq_wake_up() to ensure that
-		 * the batch size is updated before the wait counts.
+		 * Pairs with the memory barrier in sbitmap_queue_wake_up()
+		 * to ensure that the batch size is updated before the wait
+		 * counts.
 		 */
 		smp_mb__before_atomic();
 		for (i = 0; i < SBQ_WAIT_QUEUES; i++)
@@ -463,15 +464,6 @@ static bool __sbq_wake_up(struct sbitmap_queue *sbq)
 	unsigned int wake_batch;
 	int wait_cnt;
 
-	/*
-	 * Pairs with the memory barrier in set_current_state() to ensure the
-	 * proper ordering of clear_bit()/waitqueue_active() in the waker and
-	 * test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
-	 * waiter. See the comment on waitqueue_active(). This is __after_atomic
-	 * because we just did clear_bit_unlock() in the caller.
-	 */
-	smp_mb__after_atomic();
-
 	ws = sbq_wake_ptr(sbq);
 	if (!ws)
 		return false;
@@ -507,17 +499,26 @@ static bool __sbq_wake_up(struct sbitmap_queue *sbq)
 	return false;
 }
 
-static void sbq_wake_up(struct sbitmap_queue *sbq)
+void sbitmap_queue_wake_up(struct sbitmap_queue *sbq)
 {
 	while (__sbq_wake_up(sbq))
 		;
 }
+EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
 	sbitmap_clear_bit_unlock(&sbq->sb, nr);
-	sbq_wake_up(sbq);
+	/*
+	 * Pairs with the memory barrier in set_current_state() to ensure the
+	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker
+	 * and test_and_set_bit_lock()/prepare_to_wait()/finish_wait() in the
+	 * waiter. See the comment on waitqueue_active().
+	 */
+	smp_mb__after_atomic();
+	sbitmap_queue_wake_up(sbq);
 
 	if (likely(!sbq->round_robin && nr < sbq->sb.depth))
 		*per_cpu_ptr(sbq->alloc_hint, cpu) = nr;
 }
@@ -529,7 +530,7 @@ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq)
 	/*
 	 * Pairs with the memory barrier in set_current_state() like in
-	 * sbq_wake_up().
+	 * sbitmap_queue_wake_up().
 	 */
 	smp_mb();
 	wake_index = atomic_read(&sbq->wake_index);
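For context on why a fake wake up compensates correctly: each
sbq_wait_state carries a wait_cnt that starts at wake_batch, and
__sbq_wake_up() decrements it on every invocation, waking a whole batch
of waiters once the count reaches zero. The following is a simplified
sketch of that path as it looks in this era of lib/sbitmap.c; the
cmpxchg handling of concurrent wakers and the wake_index advance are
elided:

	static bool __sbq_wake_up(struct sbitmap_queue *sbq)
	{
		struct sbq_wait_state *ws;
		int wait_cnt;

		ws = sbq_wake_ptr(sbq);	/* a waitqueue with waiters, if any */
		if (!ws)
			return false;

		wait_cnt = atomic_dec_return(&ws->wait_cnt);
		if (wait_cnt <= 0) {
			/* batch boundary crossed: reset count, wake a batch */
			atomic_set(&ws->wait_cnt, READ_ONCE(sbq->wake_batch));
			wake_up_nr(&ws->wait, READ_ONCE(sbq->wake_batch));
		}
		return false;
	}

With wake_batch == 1, every decrement crosses a batch boundary, so the
compensating sbitmap_queue_wake_up(bt_prev) call in blk_mq_get_tag()
immediately wakes one of the waiters stranded on the queue the migrated
task left.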