Commit f7f70f4a authored by Ruozhu Li's avatar Ruozhu Li Committed by Christoph Hellwig

nvme: fix regression when disconnect a recovering ctrl

We encountered a problem that the disconnect command hangs.
After analyzing the log and stack, we found that the triggering
process is as follows:
CPU0                          CPU1
                                nvme_rdma_error_recovery_work
                                  nvme_rdma_teardown_io_queues
nvme_do_delete_ctrl                 nvme_stop_queues
  nvme_remove_namespaces
  --clear ctrl->namespaces
                                    nvme_start_queues
                                    --no ns in ctrl->namespaces
    nvme_ns_remove                  return(because ctrl is deleting)
      blk_freeze_queue
        blk_mq_freeze_queue_wait
        --wait for ns to unquiesce to clean infligt IO, hang forever

This problem was not found in older kernels because we will flush
err work in nvme_stop_ctrl before nvme_remove_namespaces.It does not
seem to be modified for functional reasons, the patch can be revert
to solve the problem.

Revert commit 794a4cb3 ("nvme: remove the .stop_ctrl callout")
Signed-off-by: default avatarRuozhu Li <liruozhu@huawei.com>
Reviewed-by: default avatarSagi Grimberg <sagi@grimberg.me>
Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
parent 1629de0e
...@@ -4595,6 +4595,8 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl) ...@@ -4595,6 +4595,8 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
nvme_stop_failfast_work(ctrl); nvme_stop_failfast_work(ctrl);
flush_work(&ctrl->async_event_work); flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fw_act_work); cancel_work_sync(&ctrl->fw_act_work);
if (ctrl->ops->stop_ctrl)
ctrl->ops->stop_ctrl(ctrl);
} }
EXPORT_SYMBOL_GPL(nvme_stop_ctrl); EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
......
...@@ -502,6 +502,7 @@ struct nvme_ctrl_ops { ...@@ -502,6 +502,7 @@ struct nvme_ctrl_ops {
void (*free_ctrl)(struct nvme_ctrl *ctrl); void (*free_ctrl)(struct nvme_ctrl *ctrl);
void (*submit_async_event)(struct nvme_ctrl *ctrl); void (*submit_async_event)(struct nvme_ctrl *ctrl);
void (*delete_ctrl)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl);
void (*stop_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
void (*print_device_info)(struct nvme_ctrl *ctrl); void (*print_device_info)(struct nvme_ctrl *ctrl);
}; };
......
...@@ -1048,6 +1048,14 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, ...@@ -1048,6 +1048,14 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
} }
} }
static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
cancel_work_sync(&ctrl->err_work);
cancel_delayed_work_sync(&ctrl->reconnect_work);
}
static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
{ {
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
...@@ -2252,9 +2260,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = { ...@@ -2252,9 +2260,6 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{ {
cancel_work_sync(&ctrl->err_work);
cancel_delayed_work_sync(&ctrl->reconnect_work);
nvme_rdma_teardown_io_queues(ctrl, shutdown); nvme_rdma_teardown_io_queues(ctrl, shutdown);
nvme_stop_admin_queue(&ctrl->ctrl); nvme_stop_admin_queue(&ctrl->ctrl);
if (shutdown) if (shutdown)
...@@ -2304,6 +2309,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { ...@@ -2304,6 +2309,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
.submit_async_event = nvme_rdma_submit_async_event, .submit_async_event = nvme_rdma_submit_async_event,
.delete_ctrl = nvme_rdma_delete_ctrl, .delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address, .get_address = nvmf_get_address,
.stop_ctrl = nvme_rdma_stop_ctrl,
}; };
/* /*
......
...@@ -2193,9 +2193,6 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) ...@@ -2193,9 +2193,6 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{ {
cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
nvme_tcp_teardown_io_queues(ctrl, shutdown); nvme_tcp_teardown_io_queues(ctrl, shutdown);
nvme_stop_admin_queue(ctrl); nvme_stop_admin_queue(ctrl);
if (shutdown) if (shutdown)
...@@ -2235,6 +2232,12 @@ static void nvme_reset_ctrl_work(struct work_struct *work) ...@@ -2235,6 +2232,12 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
nvme_tcp_reconnect_or_remove(ctrl); nvme_tcp_reconnect_or_remove(ctrl);
} }
static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
{
cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
}
static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl) static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
{ {
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
...@@ -2556,6 +2559,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { ...@@ -2556,6 +2559,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.submit_async_event = nvme_tcp_submit_async_event, .submit_async_event = nvme_tcp_submit_async_event,
.delete_ctrl = nvme_tcp_delete_ctrl, .delete_ctrl = nvme_tcp_delete_ctrl,
.get_address = nvmf_get_address, .get_address = nvmf_get_address,
.stop_ctrl = nvme_tcp_stop_ctrl,
}; };
static bool static bool
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment