Commit a98278ec authored by Jens Axboe

Merge branch 'block-5.9' into for-5.10/block

* block-5.9:
  blk-stat: make q->stats->lock irqsafe
  blk-iocost: ioc_pd_free() shouldn't assume irq disabled
  block: fix locking in bdev_del_partition
  block: release disk reference in hd_struct_free_work
  block: ensure bdi->io_pages is always initialized
  nvme-pci: cancel nvme device request before disabling
  nvme: only use power of two io boundaries
  nvme: fix controller instance leak
  nvmet-fc: Fix a missed _irqsave version of spin_lock in 'nvmet_fc_fod_op_done()'
  nvme: Fix NULL dereference for pci nvme controllers
  nvme-rdma: fix reset hang if controller died in the middle of a reset
  nvme-rdma: fix timeout handler
  nvme-rdma: serialize controller teardown sequences
  nvme-tcp: fix reset hang if controller died in the middle of a reset
  nvme-tcp: fix timeout handler
  nvme-tcp: serialize controller teardown sequences
  nvme: have nvme_wait_freeze_timeout return if it timed out
  nvme-fabrics: don't check state NVME_CTRL_NEW for request acceptance
  nvmet-tcp: Fix NULL dereference when a connect data comes in h2cdata pdu
parents f75aef39 e11d80a8
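
Several of the fixes in this merge (blk-stat, blk-iocost, nvmet-fc) apply the same change: a spinlock that can now be taken from IRQ context, or with interrupts already disabled, is switched to the _irqsave/_irqrestore variants. A minimal sketch of the pattern, with hypothetical names (not code from this merge):

struct foo_stats {
    spinlock_t lock;
    bool enabled;
};

static void foo_stats_enable(struct foo_stats *st)
{
    unsigned long flags;

    /*
     * spin_lock_irqsave() saves and disables interrupts, so this is safe
     * whether the caller runs in process context, in IRQ context, or with
     * interrupts already disabled; a plain spin_lock() is not.
     */
    spin_lock_irqsave(&st->lock, flags);
    st->enabled = true;
    spin_unlock_irqrestore(&st->lock, flags);
}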
@@ -539,6 +539,7 @@ struct request_queue *blk_alloc_queue(int node_id)
         goto fail_stats;
     q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
+    q->backing_dev_info->io_pages = VM_READAHEAD_PAGES;
     q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
     q->node = node_id;
...
@@ -2092,14 +2092,15 @@ static void ioc_pd_free(struct blkg_policy_data *pd)
 {
     struct ioc_gq *iocg = pd_to_iocg(pd);
     struct ioc *ioc = iocg->ioc;
+    unsigned long flags;
     if (ioc) {
-        spin_lock(&ioc->lock);
+        spin_lock_irqsave(&ioc->lock, flags);
         if (!list_empty(&iocg->active_list)) {
             propagate_active_weight(iocg, 0, 0);
             list_del_init(&iocg->active_list);
         }
-        spin_unlock(&ioc->lock);
+        spin_unlock_irqrestore(&ioc->lock, flags);
         hrtimer_cancel(&iocg->waitq_timer);
         hrtimer_cancel(&iocg->delay_timer);
...
@@ -137,6 +137,7 @@ void blk_stat_add_callback(struct request_queue *q,
                struct blk_stat_callback *cb)
 {
     unsigned int bucket;
+    unsigned long flags;
     int cpu;
     for_each_possible_cpu(cpu) {
@@ -147,20 +148,22 @@ void blk_stat_add_callback(struct request_queue *q,
             blk_rq_stat_init(&cpu_stat[bucket]);
     }
-    spin_lock(&q->stats->lock);
+    spin_lock_irqsave(&q->stats->lock, flags);
     list_add_tail_rcu(&cb->list, &q->stats->callbacks);
     blk_queue_flag_set(QUEUE_FLAG_STATS, q);
-    spin_unlock(&q->stats->lock);
+    spin_unlock_irqrestore(&q->stats->lock, flags);
 }
 void blk_stat_remove_callback(struct request_queue *q,
               struct blk_stat_callback *cb)
 {
-    spin_lock(&q->stats->lock);
+    unsigned long flags;
+
+    spin_lock_irqsave(&q->stats->lock, flags);
     list_del_rcu(&cb->list);
     if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
         blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
-    spin_unlock(&q->stats->lock);
+    spin_unlock_irqrestore(&q->stats->lock, flags);
     del_timer_sync(&cb->timer);
 }
@@ -183,10 +186,12 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
 void blk_stat_enable_accounting(struct request_queue *q)
 {
-    spin_lock(&q->stats->lock);
+    unsigned long flags;
+
+    spin_lock_irqsave(&q->stats->lock, flags);
     q->stats->enable_accounting = true;
     blk_queue_flag_set(QUEUE_FLAG_STATS, q);
-    spin_unlock(&q->stats->lock);
+    spin_unlock_irqrestore(&q->stats->lock, flags);
 }
 EXPORT_SYMBOL_GPL(blk_stat_enable_accounting);
...
@@ -278,6 +278,15 @@ static void hd_struct_free_work(struct work_struct *work)
 {
     struct hd_struct *part =
         container_of(to_rcu_work(work), struct hd_struct, rcu_work);
+    struct gendisk *disk = part_to_disk(part);
+
+    /*
+     * Release the disk reference acquired in delete_partition here.
+     * We can't release it in hd_struct_free because the final put_device
+     * needs process context and thus can't be run directly from a
+     * percpu_ref ->release handler.
+     */
+    put_device(disk_to_dev(disk));
     part->start_sect = 0;
     part->nr_sects = 0;
@@ -293,7 +302,6 @@ static void hd_struct_free(struct percpu_ref *ref)
         rcu_dereference_protected(disk->part_tbl, 1);
     rcu_assign_pointer(ptbl->last_lookup, NULL);
-    put_device(disk_to_dev(disk));
     INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work);
     queue_rcu_work(system_wq, &part->rcu_work);
@@ -524,19 +532,20 @@ int bdev_add_partition(struct block_device *bdev, int partno,
 int bdev_del_partition(struct block_device *bdev, int partno)
 {
     struct block_device *bdevp;
-    struct hd_struct *part;
-    int ret = 0;
+    struct hd_struct *part = NULL;
+    int ret;
-    part = disk_get_part(bdev->bd_disk, partno);
-    if (!part)
-        return -ENXIO;
-
-    ret = -ENOMEM;
-    bdevp = bdget(part_devt(part));
+    bdevp = bdget_disk(bdev->bd_disk, partno);
     if (!bdevp)
-        goto out_put_part;
+        return -ENOMEM;
     mutex_lock(&bdevp->bd_mutex);
+    mutex_lock_nested(&bdev->bd_mutex, 1);
+
+    ret = -ENXIO;
+    part = disk_get_part(bdev->bd_disk, partno);
+    if (!part)
+        goto out_unlock;
     ret = -EBUSY;
     if (bdevp->bd_openers)
@@ -545,16 +554,14 @@ int bdev_del_partition(struct block_device *bdev, int partno)
     sync_blockdev(bdevp);
     invalidate_bdev(bdevp);
-    mutex_lock_nested(&bdev->bd_mutex, 1);
     delete_partition(bdev->bd_disk, part);
-    mutex_unlock(&bdev->bd_mutex);
     ret = 0;
 out_unlock:
+    mutex_unlock(&bdev->bd_mutex);
     mutex_unlock(&bdevp->bd_mutex);
     bdput(bdevp);
-out_put_part:
-    disk_put_part(part);
+    if (part)
+        disk_put_part(part);
     return ret;
 }
...
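
The hd_struct change above moves the final put_device() of the disk out of the percpu_ref ->release handler (which may run in atomic context) into the queued RCU work item, which runs in process context. A hedged sketch of that general pattern, with hypothetical names rather than this driver's code:

struct foo {
    struct percpu_ref ref;
    struct rcu_work rwork;
    struct device *dev;
};

static void foo_free_work(struct work_struct *work)
{
    struct foo *f = container_of(to_rcu_work(work), struct foo, rwork);

    /* process context: the final put_device() is safe here */
    put_device(f->dev);
    kfree(f);
}

static void foo_release(struct percpu_ref *ref)
{
    struct foo *f = container_of(ref, struct foo, ref);

    /* may be called from atomic context: defer the real release */
    INIT_RCU_WORK(&f->rwork, foo_free_work);
    queue_rcu_work(system_wq, &f->rwork);
}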
@@ -2026,13 +2026,49 @@ static void nvme_update_disk_info(struct gendisk *disk,
     blk_mq_unfreeze_queue(disk->queue);
 }
+static inline bool nvme_first_scan(struct gendisk *disk)
+{
+    /* nvme_alloc_ns() scans the disk prior to adding it */
+    return !(disk->flags & GENHD_FL_UP);
+}
+
+static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+    struct nvme_ctrl *ctrl = ns->ctrl;
+    u32 iob;
+
+    if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
+        is_power_of_2(ctrl->max_hw_sectors))
+        iob = ctrl->max_hw_sectors;
+    else
+        iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
+
+    if (!iob)
+        return;
+
+    if (!is_power_of_2(iob)) {
+        if (nvme_first_scan(ns->disk))
+            pr_warn("%s: ignoring unaligned IO boundary:%u\n",
+                ns->disk->disk_name, iob);
+        return;
+    }
+
+    if (blk_queue_is_zoned(ns->disk->queue)) {
+        if (nvme_first_scan(ns->disk))
+            pr_warn("%s: ignoring zoned namespace IO boundary\n",
+                ns->disk->disk_name);
+        return;
+    }
+
+    blk_queue_chunk_sectors(ns->queue, iob);
+}
+
 static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 {
     unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
     struct nvme_ns *ns = disk->private_data;
     struct nvme_ctrl *ctrl = ns->ctrl;
     int ret;
-    u32 iob;
     /*
      * If identify namespace failed, use default 512 byte block size so
@@ -2060,12 +2096,6 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
         return -ENODEV;
     }
-    if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
-        is_power_of_2(ctrl->max_hw_sectors))
-        iob = ctrl->max_hw_sectors;
-    else
-        iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
-
     ns->features = 0;
     ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
     /* the PI implementation requires metadata equal t10 pi tuple size */
@@ -2097,8 +2127,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
         }
     }
-    if (iob && !blk_queue_is_zoned(ns->queue))
-        blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob));
+    nvme_set_chunk_sectors(ns, id);
     nvme_update_disk_info(disk, ns, id);
 #ifdef CONFIG_NVME_MULTIPATH
     if (ns->head->disk) {
@@ -3676,6 +3705,10 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
         return 0;
     if (a == &dev_attr_hostid.attr && !ctrl->opts)
         return 0;
+    if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts)
+        return 0;
+    if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts)
+        return 0;
     return a->mode;
 }
@@ -4390,7 +4423,7 @@ static void nvme_free_ctrl(struct device *dev)
     struct nvme_subsystem *subsys = ctrl->subsys;
     struct nvme_cel *cel, *next;
-    if (subsys && ctrl->instance != subsys->instance)
+    if (!subsys || ctrl->instance != subsys->instance)
         ida_simple_remove(&nvme_instance_ida, ctrl->instance);
     list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
@@ -4534,7 +4567,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
-void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 {
     struct nvme_ns *ns;
@@ -4545,6 +4578,7 @@ void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
             break;
     }
     up_read(&ctrl->namespaces_rwsem);
+    return timeout;
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
...
@@ -576,7 +576,6 @@ bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
      * which is require to set the queue live in the appropinquate states.
      */
     switch (ctrl->state) {
-    case NVME_CTRL_NEW:
     case NVME_CTRL_CONNECTING:
         if (nvme_is_fabrics(req->cmd) &&
             req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
...
@@ -605,7 +605,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl);
 void nvme_sync_queues(struct nvme_ctrl *ctrl);
 void nvme_unfreeze(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze(struct nvme_ctrl *ctrl);
-void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
 void nvme_start_freeze(struct nvme_ctrl *ctrl);
 #define NVME_QID_ANY -1
...
@@ -1249,8 +1249,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
         dev_warn_ratelimited(dev->ctrl.device,
              "I/O %d QID %d timeout, disable controller\n",
              req->tag, nvmeq->qid);
-        nvme_dev_disable(dev, true);
         nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+        nvme_dev_disable(dev, true);
         return BLK_EH_DONE;
     case NVME_CTRL_RESETTING:
         return BLK_EH_RESET_TIMER;
@@ -1267,10 +1267,10 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
         dev_warn(dev->ctrl.device,
              "I/O %d QID %d timeout, reset controller\n",
              req->tag, nvmeq->qid);
+        nvme_req(req)->flags |= NVME_REQ_CANCELLED;
         nvme_dev_disable(dev, false);
         nvme_reset_ctrl(&dev->ctrl);
-        nvme_req(req)->flags |= NVME_REQ_CANCELLED;
         return BLK_EH_DONE;
     }
...
@@ -122,6 +122,7 @@ struct nvme_rdma_ctrl {
     struct sockaddr_storage src_addr;
     struct nvme_ctrl ctrl;
+    struct mutex teardown_lock;
     bool use_inline_data;
     u32 io_queues[HCTX_MAX_TYPES];
 };
@@ -975,7 +976,15 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
     if (!new) {
         nvme_start_queues(&ctrl->ctrl);
-        nvme_wait_freeze(&ctrl->ctrl);
+        if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
+            /*
+             * If we timed out waiting for freeze we are likely to
+             * be stuck.  Fail the controller initialization just
+             * to be safe.
+             */
+            ret = -ENODEV;
+            goto out_wait_freeze_timed_out;
+        }
         blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
             ctrl->ctrl.queue_count - 1);
         nvme_unfreeze(&ctrl->ctrl);
@@ -983,6 +992,9 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
     return 0;
+out_wait_freeze_timed_out:
+    nvme_stop_queues(&ctrl->ctrl);
+    nvme_rdma_stop_io_queues(ctrl);
 out_cleanup_connect_q:
     if (new)
         blk_cleanup_queue(ctrl->ctrl.connect_q);
@@ -997,6 +1009,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
         bool remove)
 {
+    mutex_lock(&ctrl->teardown_lock);
     blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
     nvme_rdma_stop_queue(&ctrl->queues[0]);
     if (ctrl->ctrl.admin_tagset) {
@@ -1007,11 +1020,13 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
     if (remove)
         blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
     nvme_rdma_destroy_admin_queue(ctrl, remove);
+    mutex_unlock(&ctrl->teardown_lock);
 }
 static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
         bool remove)
 {
+    mutex_lock(&ctrl->teardown_lock);
     if (ctrl->ctrl.queue_count > 1) {
         nvme_start_freeze(&ctrl->ctrl);
         nvme_stop_queues(&ctrl->ctrl);
@@ -1025,6 +1040,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
         nvme_start_queues(&ctrl->ctrl);
         nvme_rdma_destroy_io_queues(ctrl, remove);
     }
+    mutex_unlock(&ctrl->teardown_lock);
 }
 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
@@ -1180,6 +1196,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
     if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
         return;
+    dev_warn(ctrl->ctrl.device, "starting error recovery\n");
     queue_work(nvme_reset_wq, &ctrl->err_work);
 }
@@ -1946,6 +1963,22 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
     return 0;
 }
+static void nvme_rdma_complete_timed_out(struct request *rq)
+{
+    struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+    struct nvme_rdma_queue *queue = req->queue;
+    struct nvme_rdma_ctrl *ctrl = queue->ctrl;
+
+    /* fence other contexts that may complete the command */
+    mutex_lock(&ctrl->teardown_lock);
+    nvme_rdma_stop_queue(queue);
+    if (!blk_mq_request_completed(rq)) {
+        nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
+        blk_mq_complete_request(rq);
+    }
+    mutex_unlock(&ctrl->teardown_lock);
+}
+
 static enum blk_eh_timer_return
 nvme_rdma_timeout(struct request *rq, bool reserved)
 {
@@ -1956,29 +1989,29 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
     dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
          rq->tag, nvme_rdma_queue_idx(queue));
-    /*
-     * Restart the timer if a controller reset is already scheduled. Any
-     * timed out commands would be handled before entering the connecting
-     * state.
-     */
-    if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
-        return BLK_EH_RESET_TIMER;
     if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
         /*
-         * Teardown immediately if controller times out while starting
-         * or we are already started error recovery. all outstanding
-         * requests are completed on shutdown, so we return BLK_EH_DONE.
+         * If we are resetting, connecting or deleting we should
+         * complete immediately because we may block controller
+         * teardown or setup sequence
+         * - ctrl disable/shutdown fabrics requests
+         * - connect requests
+         * - initialization admin requests
+         * - I/O requests that entered after unquiescing and
+         *   the controller stopped responding
+         *
+         * All other requests should be cancelled by the error
+         * recovery work, so it's fine that we fail it here.
          */
-        flush_work(&ctrl->err_work);
-        nvme_rdma_teardown_io_queues(ctrl, false);
-        nvme_rdma_teardown_admin_queue(ctrl, false);
+        nvme_rdma_complete_timed_out(rq);
         return BLK_EH_DONE;
     }
-    dev_warn(ctrl->ctrl.device, "starting error recovery\n");
+    /*
+     * LIVE state should trigger the normal error recovery which will
+     * handle completing this request.
+     */
     nvme_rdma_error_recovery(ctrl);
     return BLK_EH_RESET_TIMER;
 }
@@ -2278,6 +2311,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
         return ERR_PTR(-ENOMEM);
     ctrl->ctrl.opts = opts;
     INIT_LIST_HEAD(&ctrl->list);
+    mutex_init(&ctrl->teardown_lock);
     if (!(opts->mask & NVMF_OPT_TRSVCID)) {
         opts->trsvcid =
...
@@ -124,6 +124,7 @@ struct nvme_tcp_ctrl {
     struct sockaddr_storage src_addr;
     struct nvme_ctrl ctrl;
+    struct mutex teardown_lock;
     struct work_struct err_work;
     struct delayed_work connect_work;
     struct nvme_tcp_request async_req;
@@ -464,6 +465,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
     if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
         return;
+    dev_warn(ctrl->device, "starting error recovery\n");
     queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
 }
@@ -1526,7 +1528,6 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
     if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
         return;
-
     __nvme_tcp_stop_queue(queue);
 }
@@ -1781,7 +1782,15 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
     if (!new) {
         nvme_start_queues(ctrl);
-        nvme_wait_freeze(ctrl);
+        if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
+            /*
+             * If we timed out waiting for freeze we are likely to
+             * be stuck.  Fail the controller initialization just
+             * to be safe.
+             */
+            ret = -ENODEV;
+            goto out_wait_freeze_timed_out;
+        }
         blk_mq_update_nr_hw_queues(ctrl->tagset,
             ctrl->queue_count - 1);
         nvme_unfreeze(ctrl);
@@ -1789,6 +1798,9 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
     return 0;
+out_wait_freeze_timed_out:
+    nvme_stop_queues(ctrl);
+    nvme_tcp_stop_io_queues(ctrl);
 out_cleanup_connect_q:
     if (new)
         blk_cleanup_queue(ctrl->connect_q);
@@ -1874,6 +1886,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
         bool remove)
 {
+    mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock);
     blk_mq_quiesce_queue(ctrl->admin_q);
     nvme_tcp_stop_queue(ctrl, 0);
     if (ctrl->admin_tagset) {
@@ -1884,13 +1897,16 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
     if (remove)
         blk_mq_unquiesce_queue(ctrl->admin_q);
     nvme_tcp_destroy_admin_queue(ctrl, remove);
+    mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock);
 }
 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
         bool remove)
 {
+    mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock);
     if (ctrl->queue_count <= 1)
-        return;
+        goto out;
+    blk_mq_quiesce_queue(ctrl->admin_q);
     nvme_start_freeze(ctrl);
     nvme_stop_queues(ctrl);
     nvme_tcp_stop_io_queues(ctrl);
@@ -1902,6 +1918,8 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
     if (remove)
         nvme_start_queues(ctrl);
     nvme_tcp_destroy_io_queues(ctrl, remove);
+out:
+    mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock);
 }
 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
@@ -2148,40 +2166,55 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
     nvme_tcp_queue_request(&ctrl->async_req, true, true);
 }
+static void nvme_tcp_complete_timed_out(struct request *rq)
+{
+    struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+    struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
+
+    /* fence other contexts that may complete the command */
+    mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock);
+    nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
+    if (!blk_mq_request_completed(rq)) {
+        nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
+        blk_mq_complete_request(rq);
+    }
+    mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock);
+}
+
 static enum blk_eh_timer_return
 nvme_tcp_timeout(struct request *rq, bool reserved)
 {
     struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
-    struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
+    struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
     struct nvme_tcp_cmd_pdu *pdu = req->pdu;
-    /*
-     * Restart the timer if a controller reset is already scheduled. Any
-     * timed out commands would be handled before entering the connecting
-     * state.
-     */
-    if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
-        return BLK_EH_RESET_TIMER;
-    dev_warn(ctrl->ctrl.device,
+    dev_warn(ctrl->device,
         "queue %d: timeout request %#x type %d\n",
         nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
-    if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+    if (ctrl->state != NVME_CTRL_LIVE) {
         /*
-         * Teardown immediately if controller times out while starting
-         * or we are already started error recovery. all outstanding
-         * requests are completed on shutdown, so we return BLK_EH_DONE.
+         * If we are resetting, connecting or deleting we should
+         * complete immediately because we may block controller
+         * teardown or setup sequence
+         * - ctrl disable/shutdown fabrics requests
+         * - connect requests
+         * - initialization admin requests
+         * - I/O requests that entered after unquiescing and
+         *   the controller stopped responding
+         *
+         * All other requests should be cancelled by the error
+         * recovery work, so it's fine that we fail it here.
         */
-        flush_work(&ctrl->err_work);
-        nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
-        nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
+        nvme_tcp_complete_timed_out(rq);
         return BLK_EH_DONE;
     }
-    dev_warn(ctrl->ctrl.device, "starting error recovery\n");
-    nvme_tcp_error_recovery(&ctrl->ctrl);
+    /*
+     * LIVE state should trigger the normal error recovery which will
+     * handle completing this request.
+     */
+    nvme_tcp_error_recovery(ctrl);
     return BLK_EH_RESET_TIMER;
 }
@@ -2422,6 +2455,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
             nvme_tcp_reconnect_ctrl_work);
     INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
     INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
+    mutex_init(&ctrl->teardown_lock);
     if (!(opts->mask & NVMF_OPT_TRSVCID)) {
         opts->trsvcid =
...
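
The rdma and tcp timeout fixes above share one structure: the timed-out request is completed under the same teardown_lock that the teardown paths hold, after the queue has been stopped, so the timeout handler cannot race with error recovery completing the same command. A condensed restatement of that shape with simplified, hypothetical names (not the actual driver code):

static void foo_complete_timed_out(struct request *rq, struct foo_queue *queue)
{
    /* fence teardown/error-recovery contexts that may complete rq */
    mutex_lock(&queue->ctrl->teardown_lock);
    foo_stop_queue(queue);
    if (!blk_mq_request_completed(rq)) {
        nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
        blk_mq_complete_request(rq);
    }
    mutex_unlock(&queue->ctrl->teardown_lock);
}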
@@ -2342,9 +2342,9 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
             return;
         if (fcpreq->fcp_error ||
             fcpreq->transferred_length != fcpreq->transfer_length) {
-            spin_lock(&fod->flock);
+            spin_lock_irqsave(&fod->flock, flags);
             fod->abort = true;
-            spin_unlock(&fod->flock);
+            spin_unlock_irqrestore(&fod->flock, flags);
             nvmet_req_complete(&fod->req, NVME_SC_INTERNAL);
             return;
...
@@ -160,6 +160,11 @@ static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
         struct nvmet_tcp_cmd *cmd)
 {
+    if (unlikely(!queue->nr_cmds)) {
+        /* We didn't allocate cmds yet, send 0xffff */
+        return USHRT_MAX;
+    }
+
     return cmd - queue->cmds;
 }
@@ -866,7 +871,10 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
     struct nvme_tcp_data_pdu *data = &queue->pdu.data;
     struct nvmet_tcp_cmd *cmd;
-    cmd = &queue->cmds[data->ttag];
+    if (likely(queue->nr_cmds))
+        cmd = &queue->cmds[data->ttag];
+    else
+        cmd = &queue->connect;
     if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
         pr_err("ttag %u unexpected data offset %u (expected %u)\n",
...