Commit f0276924 authored by Shaohua Li's avatar Shaohua Li Committed by Jens Axboe

blk-mq: Don't reserve a tag for flush request

Reserving a tag (request) for flush to avoid deadlock is overkill. A
tag is a valuable resource. We can track the number of flush requests and
disallow having too many pending flush requests allocated. With this
patch, blk_mq_alloc_request_pinned() could do a busy nop (but not a dead
loop) if too many pending requests are allocated and a new flush request
is allocated. But this should not be a problem; too many pending flush
requests are a very rare case.

I verified this can fix the deadlock caused by too many pending flush
requests.
Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent d835502f
...@@ -284,9 +284,8 @@ static void mq_flush_work(struct work_struct *work) ...@@ -284,9 +284,8 @@ static void mq_flush_work(struct work_struct *work)
q = container_of(work, struct request_queue, mq_flush_work); q = container_of(work, struct request_queue, mq_flush_work);
/* We don't need set REQ_FLUSH_SEQ, it's for consistency */
rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ, rq = blk_mq_alloc_request(q, WRITE_FLUSH|REQ_FLUSH_SEQ,
__GFP_WAIT|GFP_ATOMIC, true); __GFP_WAIT|GFP_ATOMIC, false);
rq->cmd_type = REQ_TYPE_FS; rq->cmd_type = REQ_TYPE_FS;
rq->end_io = flush_end_io; rq->end_io = flush_end_io;
...@@ -408,8 +407,11 @@ void blk_insert_flush(struct request *rq) ...@@ -408,8 +407,11 @@ void blk_insert_flush(struct request *rq)
/* /*
* @policy now records what operations need to be done. Adjust * @policy now records what operations need to be done. Adjust
* REQ_FLUSH and FUA for the driver. * REQ_FLUSH and FUA for the driver.
* We keep REQ_FLUSH for mq to track flush requests. For !FUA,
* we never dispatch the request directly.
*/ */
rq->cmd_flags &= ~REQ_FLUSH; if (rq->cmd_flags & REQ_FUA)
rq->cmd_flags &= ~REQ_FLUSH;
if (!(fflags & REQ_FUA)) if (!(fflags & REQ_FUA))
rq->cmd_flags &= ~REQ_FUA; rq->cmd_flags &= ~REQ_FUA;
......
...@@ -194,9 +194,27 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, ...@@ -194,9 +194,27 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
} }
static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx, static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
gfp_t gfp, bool reserved) gfp_t gfp, bool reserved,
int rw)
{ {
return blk_mq_alloc_rq(hctx, gfp, reserved); struct request *req;
bool is_flush = false;
/*
* flush need allocate a request, leave at least one request for
* non-flush IO to avoid deadlock
*/
if ((rw & REQ_FLUSH) && !(rw & REQ_FLUSH_SEQ)) {
if (atomic_inc_return(&hctx->pending_flush) >=
hctx->queue_depth - hctx->reserved_tags - 1) {
atomic_dec(&hctx->pending_flush);
return NULL;
}
is_flush = true;
}
req = blk_mq_alloc_rq(hctx, gfp, reserved);
if (!req && is_flush)
atomic_dec(&hctx->pending_flush);
return req;
} }
static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
...@@ -209,7 +227,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q, ...@@ -209,7 +227,7 @@ static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu); struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved); rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved, rw);
if (rq) { if (rq) {
blk_mq_rq_ctx_init(q, ctx, rq, rw); blk_mq_rq_ctx_init(q, ctx, rq, rw);
break; break;
...@@ -272,6 +290,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, ...@@ -272,6 +290,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
const int tag = rq->tag; const int tag = rq->tag;
struct request_queue *q = rq->q; struct request_queue *q = rq->q;
if ((rq->cmd_flags & REQ_FLUSH) && !(rq->cmd_flags & REQ_FLUSH_SEQ))
atomic_dec(&hctx->pending_flush);
blk_mq_rq_init(hctx, rq); blk_mq_rq_init(hctx, rq);
blk_mq_put_tag(hctx->tags, tag); blk_mq_put_tag(hctx->tags, tag);
...@@ -900,14 +921,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) ...@@ -900,14 +921,14 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
hctx = q->mq_ops->map_queue(q, ctx->cpu); hctx = q->mq_ops->map_queue(q, ctx->cpu);
trace_block_getrq(q, bio, rw); trace_block_getrq(q, bio, rw);
rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false); rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false, bio->bi_rw);
if (likely(rq)) if (likely(rq))
blk_mq_rq_ctx_init(q, ctx, rq, rw); blk_mq_rq_ctx_init(q, ctx, rq, bio->bi_rw);
else { else {
blk_mq_put_ctx(ctx); blk_mq_put_ctx(ctx);
trace_block_sleeprq(q, bio, rw); trace_block_sleeprq(q, bio, rw);
rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC, rq = blk_mq_alloc_request_pinned(q, bio->bi_rw,
false); __GFP_WAIT|GFP_ATOMIC, false);
ctx = rq->mq_ctx; ctx = rq->mq_ctx;
hctx = q->mq_ops->map_queue(q, ctx->cpu); hctx = q->mq_ops->map_queue(q, ctx->cpu);
} }
...@@ -1184,7 +1205,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q, ...@@ -1184,7 +1205,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
hctx->queue_num = i; hctx->queue_num = i;
hctx->flags = reg->flags; hctx->flags = reg->flags;
hctx->queue_depth = reg->queue_depth; hctx->queue_depth = reg->queue_depth;
hctx->reserved_tags = reg->reserved_tags;
hctx->cmd_size = reg->cmd_size; hctx->cmd_size = reg->cmd_size;
atomic_set(&hctx->pending_flush, 0);
blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
blk_mq_hctx_notify, hctx); blk_mq_hctx_notify, hctx);
...@@ -1309,15 +1332,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg, ...@@ -1309,15 +1332,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
reg->queue_depth = BLK_MQ_MAX_DEPTH; reg->queue_depth = BLK_MQ_MAX_DEPTH;
} }
/*
* Set aside a tag for flush requests. It will only be used while
* another flush request is in progress but outside the driver.
*
* TODO: only allocate if flushes are supported
*/
reg->queue_depth++;
reg->reserved_tags++;
if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN)) if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
......
...@@ -36,12 +36,15 @@ struct blk_mq_hw_ctx { ...@@ -36,12 +36,15 @@ struct blk_mq_hw_ctx {
struct list_head page_list; struct list_head page_list;
struct blk_mq_tags *tags; struct blk_mq_tags *tags;
atomic_t pending_flush;
unsigned long queued; unsigned long queued;
unsigned long run; unsigned long run;
#define BLK_MQ_MAX_DISPATCH_ORDER 10 #define BLK_MQ_MAX_DISPATCH_ORDER 10
unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
unsigned int queue_depth; unsigned int queue_depth;
unsigned int reserved_tags;
unsigned int numa_node; unsigned int numa_node;
unsigned int cmd_size; /* per-request extra data */ unsigned int cmd_size; /* per-request extra data */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment