Commit 96c82cdf authored by Moshe Shemesh, committed by Saeed Mahameed

net/mlx5: Add fw fatal devlink_health_reporter

Create mlx5_devlink_health_reporter for fw fatal reporter.
The fw fatal reporter is added in addition to the fw reporter and
implements the recover callback.
The point of having two reporters for FW issues is that we
don't want to run FW recovery on every issue, only on fatal ones.
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
parent d1bf0e2c
...@@ -301,31 +301,43 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) ...@@ -301,31 +301,43 @@ static void mlx5_handle_bad_state(struct mlx5_core_dev *dev)
/* How much time to wait until health resetting the driver (in msecs) */ /* How much time to wait until health resetting the driver (in msecs) */
#define MLX5_RECOVERY_WAIT_MSECS 60000 #define MLX5_RECOVERY_WAIT_MSECS 60000
static void health_care(struct work_struct *work) static int mlx5_health_try_recover(struct mlx5_core_dev *dev)
{ {
struct mlx5_core_health *health;
struct mlx5_core_dev *dev;
struct mlx5_priv *priv;
unsigned long end; unsigned long end;
health = container_of(work, struct mlx5_core_health, work);
priv = container_of(health, struct mlx5_priv, health);
dev = container_of(priv, struct mlx5_core_dev, priv);
mlx5_core_warn(dev, "handling bad device here\n"); mlx5_core_warn(dev, "handling bad device here\n");
mlx5_handle_bad_state(dev); mlx5_handle_bad_state(dev);
end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS); end = jiffies + msecs_to_jiffies(MLX5_RECOVERY_WAIT_MSECS);
while (sensor_pci_not_working(dev)) { while (sensor_pci_not_working(dev)) {
if (time_after(jiffies, end)) { if (time_after(jiffies, end)) {
mlx5_core_err(dev, mlx5_core_err(dev,
"health recovery flow aborted, PCI reads still not working\n"); "health recovery flow aborted, PCI reads still not working\n");
return; return -EIO;
} }
msleep(100); msleep(100);
} }
mlx5_core_err(dev, "starting health recovery flow\n"); mlx5_core_err(dev, "starting health recovery flow\n");
mlx5_recover_device(dev); mlx5_recover_device(dev);
if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state) ||
check_fatal_sensors(dev)) {
mlx5_core_err(dev, "health recovery failed\n");
return -EIO;
}
return 0;
}
static void health_recover_work(struct work_struct *work)
{
struct mlx5_core_health *health;
struct mlx5_core_dev *dev;
struct mlx5_priv *priv;
health = container_of(work, struct mlx5_core_health, work);
priv = container_of(health, struct mlx5_priv, health);
dev = container_of(priv, struct mlx5_core_dev, priv);
mlx5_health_try_recover(dev);
} }
static const char *hsynd_str(u8 synd) static const char *hsynd_str(u8 synd)
...@@ -544,7 +556,22 @@ static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { ...@@ -544,7 +556,22 @@ static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
.dump = mlx5_fw_reporter_dump, .dump = mlx5_fw_reporter_dump,
}; };
static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev) static int
mlx5_fw_fatal_reporter_recover(struct devlink_health_reporter *reporter,
void *priv_ctx)
{
struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
return mlx5_health_try_recover(dev);
}
static const struct devlink_health_reporter_ops mlx5_fw_fatal_reporter_ops = {
.name = "fw_fatal",
.recover = mlx5_fw_fatal_reporter_recover,
};
#define MLX5_REPORTER_FW_GRACEFUL_PERIOD 1200000
static void mlx5_fw_reporters_create(struct mlx5_core_dev *dev)
{ {
struct mlx5_core_health *health = &dev->priv.health; struct mlx5_core_health *health = &dev->priv.health;
struct devlink *devlink = priv_to_devlink(dev); struct devlink *devlink = priv_to_devlink(dev);
...@@ -555,16 +582,26 @@ static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev) ...@@ -555,16 +582,26 @@ static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
if (IS_ERR(health->fw_reporter)) if (IS_ERR(health->fw_reporter))
mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n", mlx5_core_warn(dev, "Failed to create fw reporter, err = %ld\n",
PTR_ERR(health->fw_reporter)); PTR_ERR(health->fw_reporter));
health->fw_fatal_reporter =
devlink_health_reporter_create(devlink,
&mlx5_fw_fatal_reporter_ops,
MLX5_REPORTER_FW_GRACEFUL_PERIOD,
true, dev);
if (IS_ERR(health->fw_fatal_reporter))
mlx5_core_warn(dev, "Failed to create fw fatal reporter, err = %ld\n",
PTR_ERR(health->fw_fatal_reporter));
} }
static void mlx5_fw_reporter_destroy(struct mlx5_core_dev *dev) static void mlx5_fw_reporters_destroy(struct mlx5_core_dev *dev)
{ {
struct mlx5_core_health *health = &dev->priv.health; struct mlx5_core_health *health = &dev->priv.health;
if (IS_ERR_OR_NULL(health->fw_reporter)) if (!IS_ERR_OR_NULL(health->fw_reporter))
return;
devlink_health_reporter_destroy(health->fw_reporter); devlink_health_reporter_destroy(health->fw_reporter);
if (!IS_ERR_OR_NULL(health->fw_fatal_reporter))
devlink_health_reporter_destroy(health->fw_fatal_reporter);
} }
static unsigned long get_next_poll_jiffies(void) static unsigned long get_next_poll_jiffies(void)
...@@ -686,7 +723,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev) ...@@ -686,7 +723,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev)
struct mlx5_core_health *health = &dev->priv.health; struct mlx5_core_health *health = &dev->priv.health;
destroy_workqueue(health->wq); destroy_workqueue(health->wq);
mlx5_fw_reporter_destroy(dev); mlx5_fw_reporters_destroy(dev);
} }
int mlx5_health_init(struct mlx5_core_dev *dev) int mlx5_health_init(struct mlx5_core_dev *dev)
...@@ -694,22 +731,26 @@ int mlx5_health_init(struct mlx5_core_dev *dev) ...@@ -694,22 +731,26 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
struct mlx5_core_health *health; struct mlx5_core_health *health;
char *name; char *name;
mlx5_fw_reporters_create(dev);
health = &dev->priv.health; health = &dev->priv.health;
name = kmalloc(64, GFP_KERNEL); name = kmalloc(64, GFP_KERNEL);
if (!name) if (!name)
return -ENOMEM; goto out_err;
strcpy(name, "mlx5_health"); strcpy(name, "mlx5_health");
strcat(name, dev_name(dev->device)); strcat(name, dev_name(dev->device));
health->wq = create_singlethread_workqueue(name); health->wq = create_singlethread_workqueue(name);
kfree(name); kfree(name);
if (!health->wq) if (!health->wq)
return -ENOMEM; goto out_err;
spin_lock_init(&health->wq_lock); spin_lock_init(&health->wq_lock);
INIT_WORK(&health->work, health_care); INIT_WORK(&health->work, health_recover_work);
INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work); INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
mlx5_fw_reporter_create(dev);
return 0; return 0;
out_err:
mlx5_fw_reporters_destroy(dev);
return -ENOMEM;
} }
...@@ -446,6 +446,7 @@ struct mlx5_core_health { ...@@ -446,6 +446,7 @@ struct mlx5_core_health {
struct work_struct report_work; struct work_struct report_work;
struct delayed_work recover_work; struct delayed_work recover_work;
struct devlink_health_reporter *fw_reporter; struct devlink_health_reporter *fw_reporter;
struct devlink_health_reporter *fw_fatal_reporter;
}; };
struct mlx5_qp_table { struct mlx5_qp_table {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment