Commit b3bd076f authored by Moshe Shemesh, committed by Saeed Mahameed

net/mlx5: Report devlink health on FW fatal issues

Report devlink health on FW fatal issues via fw_fatal_reporter. The
driver's recovery flow for FW fatal errors is now handled by devlink
health.

With recovery controlled by devlink health, the user can disable
auto-recovery for a debug session and run the recovery manually.

Call mlx5_enter_error_state() before calling devlink_health_report() to
ensure the device enters the error state even if auto-recovery is off.

Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
parent 9b1f2982
...@@ -327,19 +327,6 @@ static int mlx5_health_try_recover(struct mlx5_core_dev *dev) ...@@ -327,19 +327,6 @@ static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
return 0; return 0;
} }
/*
 * Work handler for the health recovery work item.
 *
 * Walks back from the embedded work_struct to the owning mlx5_core_dev
 * (work -> mlx5_core_health -> mlx5_priv -> mlx5_core_dev) and runs a
 * recovery attempt on that device.
 */
static void health_recover_work(struct work_struct *work)
{
	struct mlx5_core_health *core_health =
		container_of(work, struct mlx5_core_health, work);
	struct mlx5_priv *mpriv =
		container_of(core_health, struct mlx5_priv, health);
	struct mlx5_core_dev *mdev =
		container_of(mpriv, struct mlx5_core_dev, priv);

	/* The return value of the recovery attempt is discarded here. */
	mlx5_health_try_recover(mdev);
}
static const char *hsynd_str(u8 synd) static const char *hsynd_str(u8 synd)
{ {
switch (synd) { switch (synd) {
...@@ -614,6 +601,29 @@ mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter, ...@@ -614,6 +601,29 @@ mlx5_fw_fatal_reporter_dump(struct devlink_health_reporter *reporter,
return err; return err;
} }
static void mlx5_fw_fatal_reporter_err_work(struct work_struct *work)
{
struct mlx5_fw_reporter_ctx fw_reporter_ctx;
struct mlx5_core_health *health;
struct mlx5_core_dev *dev;
struct mlx5_priv *priv;
health = container_of(work, struct mlx5_core_health, fatal_report_work);
priv = container_of(health, struct mlx5_priv, health);
dev = container_of(priv, struct mlx5_core_dev, priv);
mlx5_enter_error_state(dev, false);
if (IS_ERR_OR_NULL(health->fw_fatal_reporter)) {
if (mlx5_health_try_recover(dev))
mlx5_core_err(dev, "health recovery failed\n");
return;
}
fw_reporter_ctx.err_synd = health->synd;
fw_reporter_ctx.miss_counter = health->miss_counter;
devlink_health_report(health->fw_fatal_reporter,
"FW fatal error reported", &fw_reporter_ctx);
}
static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = { static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
.name = "fw_fatal", .name = "fw_fatal",
.recover = mlx5_fw_fatal_reporter_recover, .recover = mlx5_fw_fatal_reporter_recover,
...@@ -672,7 +682,7 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev) ...@@ -672,7 +682,7 @@ void mlx5_trigger_health_work(struct mlx5_core_dev *dev)
spin_lock_irqsave(&health->wq_lock, flags); spin_lock_irqsave(&health->wq_lock, flags);
if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
queue_work(health->wq, &health->work); queue_work(health->wq, &health->fatal_report_work);
else else
mlx5_core_err(dev, "new health works are not permitted at this stage\n"); mlx5_core_err(dev, "new health works are not permitted at this stage\n");
spin_unlock_irqrestore(&health->wq_lock, flags); spin_unlock_irqrestore(&health->wq_lock, flags);
...@@ -758,7 +768,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev) ...@@ -758,7 +768,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
spin_unlock_irqrestore(&health->wq_lock, flags); spin_unlock_irqrestore(&health->wq_lock, flags);
cancel_work_sync(&health->report_work); cancel_work_sync(&health->report_work);
cancel_work_sync(&health->work); cancel_work_sync(&health->fatal_report_work);
} }
void mlx5_health_flush(struct mlx5_core_dev *dev) void mlx5_health_flush(struct mlx5_core_dev *dev)
...@@ -795,7 +805,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev) ...@@ -795,7 +805,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
if (!health->wq) if (!health->wq)
goto out_err; goto out_err;
spin_lock_init(&health->wq_lock); spin_lock_init(&health->wq_lock);
INIT_WORK(&health->work, health_recover_work); INIT_WORK(&health->fatal_report_work, mlx5_fw_fatal_reporter_err_work);
INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work); INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
return 0; return 0;
......
...@@ -1363,11 +1363,8 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, ...@@ -1363,11 +1363,8 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
mlx5_enter_error_state(dev, false); mlx5_enter_error_state(dev, false);
mlx5_error_sw_reset(dev); mlx5_error_sw_reset(dev);
mlx5_unload_one(dev, false); mlx5_unload_one(dev, false);
/* In case of kernel call drain the health wq */
if (state) {
mlx5_drain_health_wq(dev); mlx5_drain_health_wq(dev);
mlx5_pci_disable_device(dev); mlx5_pci_disable_device(dev);
}
return state == pci_channel_io_perm_failure ? return state == pci_channel_io_perm_failure ?
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
...@@ -1535,7 +1532,8 @@ MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table); ...@@ -1535,7 +1532,8 @@ MODULE_DEVICE_TABLE(pci, mlx5_core_pci_table);
void mlx5_disable_device(struct mlx5_core_dev *dev) void mlx5_disable_device(struct mlx5_core_dev *dev)
{ {
mlx5_pci_err_detected(dev->pdev, 0); mlx5_error_sw_reset(dev);
mlx5_unload_one(dev, false);
} }
void mlx5_recover_device(struct mlx5_core_dev *dev) void mlx5_recover_device(struct mlx5_core_dev *dev)
......
...@@ -442,7 +442,7 @@ struct mlx5_core_health { ...@@ -442,7 +442,7 @@ struct mlx5_core_health {
spinlock_t wq_lock; spinlock_t wq_lock;
struct workqueue_struct *wq; struct workqueue_struct *wq;
unsigned long flags; unsigned long flags;
struct work_struct work; struct work_struct fatal_report_work;
struct work_struct report_work; struct work_struct report_work;
struct delayed_work recover_work; struct delayed_work recover_work;
struct devlink_health_reporter *fw_reporter; struct devlink_health_reporter *fw_reporter;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment