Commit d3117c83 authored by Quinn Tran, committed by Martin K. Petersen

scsi: qla2xxx: Wind down adapter after PCIe error

Put adapter into a wind down state if OS does not make any attempt to
recover the adapter after PCIe error.

Link: https://lore.kernel.org/r/20220616053508.27186-4-njavali@marvell.com
Cc: stable@vger.kernel.org
Signed-off-by: Quinn Tran <qutran@marvell.com>
Signed-off-by: Nilesh Javali <njavali@marvell.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
parent 476da8fa
...@@ -3061,6 +3061,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job) ...@@ -3061,6 +3061,13 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
ql_log(ql_log_info, vha, 0x708b, "%s CMD timeout. bsg ptr %p.\n", ql_log(ql_log_info, vha, 0x708b, "%s CMD timeout. bsg ptr %p.\n",
__func__, bsg_job); __func__, bsg_job);
if (qla2x00_isp_reg_stat(ha)) {
ql_log(ql_log_info, vha, 0x9007,
"PCI/Register disconnect.\n");
qla_pci_set_eeh_busy(vha);
}
/* find the bsg job from the active list of commands */ /* find the bsg job from the active list of commands */
spin_lock_irqsave(&ha->hardware_lock, flags); spin_lock_irqsave(&ha->hardware_lock, flags);
for (que = 0; que < ha->max_req_queues; que++) { for (que = 0; que < ha->max_req_queues; que++) {
...@@ -3078,7 +3085,8 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job) ...@@ -3078,7 +3085,8 @@ qla24xx_bsg_timeout(struct bsg_job *bsg_job)
sp->u.bsg_job == bsg_job) { sp->u.bsg_job == bsg_job) {
req->outstanding_cmds[cnt] = NULL; req->outstanding_cmds[cnt] = NULL;
spin_unlock_irqrestore(&ha->hardware_lock, flags); spin_unlock_irqrestore(&ha->hardware_lock, flags);
if (ha->isp_ops->abort_command(sp)) {
if (!ha->flags.eeh_busy && ha->isp_ops->abort_command(sp)) {
ql_log(ql_log_warn, vha, 0x7089, ql_log(ql_log_warn, vha, 0x7089,
"mbx abort_command failed.\n"); "mbx abort_command failed.\n");
bsg_reply->result = -EIO; bsg_reply->result = -EIO;
......
...@@ -4053,6 +4053,9 @@ struct qla_hw_data { ...@@ -4053,6 +4053,9 @@ struct qla_hw_data {
uint32_t n2n_fw_acc_sec:1; uint32_t n2n_fw_acc_sec:1;
uint32_t plogi_template_valid:1; uint32_t plogi_template_valid:1;
uint32_t port_isolated:1; uint32_t port_isolated:1;
uint32_t eeh_flush:2;
#define EEH_FLUSH_RDY 1
#define EEH_FLUSH_DONE 2
} flags; } flags;
uint16_t max_exchg; uint16_t max_exchg;
...@@ -4087,6 +4090,7 @@ struct qla_hw_data { ...@@ -4087,6 +4090,7 @@ struct qla_hw_data {
uint32_t rsp_que_len; uint32_t rsp_que_len;
uint32_t req_que_off; uint32_t req_que_off;
uint32_t rsp_que_off; uint32_t rsp_que_off;
unsigned long eeh_jif;
/* Multi queue data structs */ /* Multi queue data structs */
device_reg_t *mqiobase; device_reg_t *mqiobase;
......
...@@ -47,6 +47,7 @@ qla2x00_sp_timeout(struct timer_list *t) ...@@ -47,6 +47,7 @@ qla2x00_sp_timeout(struct timer_list *t)
{ {
srb_t *sp = from_timer(sp, t, u.iocb_cmd.timer); srb_t *sp = from_timer(sp, t, u.iocb_cmd.timer);
struct srb_iocb *iocb; struct srb_iocb *iocb;
scsi_qla_host_t *vha = sp->vha;
WARN_ON(irqs_disabled()); WARN_ON(irqs_disabled());
iocb = &sp->u.iocb_cmd; iocb = &sp->u.iocb_cmd;
...@@ -54,6 +55,12 @@ qla2x00_sp_timeout(struct timer_list *t) ...@@ -54,6 +55,12 @@ qla2x00_sp_timeout(struct timer_list *t)
/* ref: TMR */ /* ref: TMR */
kref_put(&sp->cmd_kref, qla2x00_sp_release); kref_put(&sp->cmd_kref, qla2x00_sp_release);
if (vha && qla2x00_isp_reg_stat(vha->hw)) {
ql_log(ql_log_info, vha, 0x9008,
"PCI/Register disconnect.\n");
qla_pci_set_eeh_busy(vha);
}
} }
void qla2x00_sp_free(srb_t *sp) void qla2x00_sp_free(srb_t *sp)
...@@ -9671,6 +9678,12 @@ int qla2xxx_disable_port(struct Scsi_Host *host) ...@@ -9671,6 +9678,12 @@ int qla2xxx_disable_port(struct Scsi_Host *host)
vha->hw->flags.port_isolated = 1; vha->hw->flags.port_isolated = 1;
if (qla2x00_isp_reg_stat(vha->hw)) {
ql_log(ql_log_info, vha, 0x9006,
"PCI/Register disconnect, exiting.\n");
qla_pci_set_eeh_busy(vha);
return FAILED;
}
if (qla2x00_chip_is_down(vha)) if (qla2x00_chip_is_down(vha))
return 0; return 0;
...@@ -9686,6 +9699,13 @@ int qla2xxx_enable_port(struct Scsi_Host *host) ...@@ -9686,6 +9699,13 @@ int qla2xxx_enable_port(struct Scsi_Host *host)
{ {
scsi_qla_host_t *vha = shost_priv(host); scsi_qla_host_t *vha = shost_priv(host);
if (qla2x00_isp_reg_stat(vha->hw)) {
ql_log(ql_log_info, vha, 0x9001,
"PCI/Register disconnect, exiting.\n");
qla_pci_set_eeh_busy(vha);
return FAILED;
}
vha->hw->flags.port_isolated = 0; vha->hw->flags.port_isolated = 0;
/* Set the flag to 1, so that isp_abort can proceed */ /* Set the flag to 1, so that isp_abort can proceed */
vha->flags.online = 1; vha->flags.online = 1;
......
...@@ -333,6 +333,11 @@ MODULE_PARM_DESC(ql2xabts_wait_nvme, ...@@ -333,6 +333,11 @@ MODULE_PARM_DESC(ql2xabts_wait_nvme,
"To wait for ABTS response on I/O timeouts for NVMe. (default: 1)"); "To wait for ABTS response on I/O timeouts for NVMe. (default: 1)");
/*
 * Delay, in seconds, before the driver starts handling a PCI error on its
 * own (default 5). Registered as a uint module parameter with mode 0644.
 */
u32 ql2xdelay_before_pci_error_handling = 5;
module_param(ql2xdelay_before_pci_error_handling, uint, 0644);
MODULE_PARM_DESC(ql2xdelay_before_pci_error_handling,
"Number of seconds delayed before qla begin PCI error self-handling (default: 5).\n");
static void qla2x00_clear_drv_active(struct qla_hw_data *); static void qla2x00_clear_drv_active(struct qla_hw_data *);
static void qla2x00_free_device(scsi_qla_host_t *); static void qla2x00_free_device(scsi_qla_host_t *);
static int qla2xxx_map_queues(struct Scsi_Host *shost); static int qla2xxx_map_queues(struct Scsi_Host *shost);
...@@ -7238,6 +7243,44 @@ static void qla_heart_beat(struct scsi_qla_host *vha, u16 dpc_started) ...@@ -7238,6 +7243,44 @@ static void qla_heart_beat(struct scsi_qla_host *vha, u16 dpc_started)
} }
} }
/*
 * qla_wind_down_chip() - best-effort wind down of the adapter after a PCIe
 * error that the system is not recovering.
 * @vha: host adapter whose hardware (vha->hw) is examined.
 *
 * Does nothing unless flags.eeh_busy is set and pci_error_state is clear.
 * Otherwise it advances a small state machine kept in flags.eeh_flush,
 * using eeh_jif as the timestamp each stage is measured from:
 *
 *   0              -> once ql2xdelay_before_pci_error_handling seconds have
 *                     passed, reset the chip, disable interrupts, restart
 *                     the timestamp and move to EEH_FLUSH_RDY.
 *   EEH_FLUSH_RDY  -> once a further 5 seconds have passed, clear PCI bus
 *                     mastering, flush all commands and move to
 *                     EEH_FLUSH_DONE.
 *   anything else  -> nothing left to do.
 */
static void qla_wind_down_chip(scsi_qla_host_t *vha)
{
	struct qla_hw_data *hw = vha->hw;
	unsigned long deadline;

	/*
	 * Only act while an EEH condition is flagged; if pci_error_state is
	 * set the system is trying to recover, so stay out of its way.
	 */
	if (!hw->flags.eeh_busy || hw->pci_error_state)
		return;

	/*
	 * Current system is not handling PCIE error. At this point, this is
	 * best effort to wind down the adapter.
	 */
	switch (hw->flags.eeh_flush) {
	case 0:
		deadline = hw->eeh_jif + ql2xdelay_before_pci_error_handling * HZ;
		if (!time_after_eq(jiffies, deadline))
			break;

		ql_log(ql_log_info, vha, 0x9009,
		    "PCI Error detected, attempting to reset hardware.\n");

		hw->isp_ops->reset_chip(vha);
		hw->isp_ops->disable_intrs(hw);

		hw->flags.eeh_flush = EEH_FLUSH_RDY;
		/* The next stage is timed from this point. */
		hw->eeh_jif = jiffies;
		break;

	case EEH_FLUSH_RDY:
		deadline = hw->eeh_jif + 5 * HZ;
		if (!time_after_eq(jiffies, deadline))
			break;

		pci_clear_master(hw->pdev);

		/* flush all command */
		qla2x00_abort_isp_cleanup(vha);

		hw->flags.eeh_flush = EEH_FLUSH_DONE;

		ql_log(ql_log_info, vha, 0x900a,
		    "PCI Error handling complete, all IOs aborted.\n");
		break;

	default:
		/* No further action for any other eeh_flush value. */
		break;
	}
}
/************************************************************************** /**************************************************************************
* qla2x00_timer * qla2x00_timer
* *
...@@ -7261,6 +7304,8 @@ qla2x00_timer(struct timer_list *t) ...@@ -7261,6 +7304,8 @@ qla2x00_timer(struct timer_list *t)
fc_port_t *fcport = NULL; fc_port_t *fcport = NULL;
if (ha->flags.eeh_busy) { if (ha->flags.eeh_busy) {
qla_wind_down_chip(vha);
ql_dbg(ql_dbg_timer, vha, 0x6000, ql_dbg(ql_dbg_timer, vha, 0x6000,
"EEH = %d, restarting timer.\n", "EEH = %d, restarting timer.\n",
ha->flags.eeh_busy); ha->flags.eeh_busy);
...@@ -7841,6 +7886,9 @@ void qla_pci_set_eeh_busy(struct scsi_qla_host *vha) ...@@ -7841,6 +7886,9 @@ void qla_pci_set_eeh_busy(struct scsi_qla_host *vha)
spin_lock_irqsave(&base_vha->work_lock, flags); spin_lock_irqsave(&base_vha->work_lock, flags);
if (!ha->flags.eeh_busy) { if (!ha->flags.eeh_busy) {
ha->eeh_jif = jiffies;
ha->flags.eeh_flush = 0;
ha->flags.eeh_busy = 1; ha->flags.eeh_busy = 1;
do_cleanup = true; do_cleanup = true;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment