Commit 37ca95e6, authored by Gavin Li, committed by Saeed Mahameed

net/mlx5: Increase FW pre-init timeout for health recovery

Currently, health recovery reloads the driver to recover the device from
fatal errors. During the driver's load process, it waits up to 120 seconds
for FW to set the pre-init bit; beyond this threshold, it aborts the load
process. In some cases, such as a FW upgrade on the DPU, this timeout
period is insufficient, and the user has no way to recover the host
device.

To solve this issue, introduce a new FW pre-init timeout for health
recovery, which is set to 2 hours.

Devlink reload and probe keep the original timeout because they are
user-triggered flows, and therefore should not have a significantly long
timeout, during which the user command would hang.
Signed-off-by: Gavin Li <gavinl@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
parent 8324a02c
......@@ -178,13 +178,13 @@ static int mlx5_devlink_reload_up(struct devlink *devlink, enum devlink_reload_a
*actions_performed = BIT(action);
switch (action) {
case DEVLINK_RELOAD_ACTION_DRIVER_REINIT:
return mlx5_load_one(dev);
return mlx5_load_one(dev, false);
case DEVLINK_RELOAD_ACTION_FW_ACTIVATE:
if (limit == DEVLINK_RELOAD_LIMIT_NO_RESET)
break;
/* On fw_activate action, also driver is reloaded and reinit performed */
*actions_performed |= BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
return mlx5_load_one(dev);
return mlx5_load_one(dev, false);
default:
/* Unsupported action should not get to this function */
WARN_ON(1);
......
......@@ -148,7 +148,7 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
complete(&fw_reset->done);
} else {
mlx5_load_one(dev);
mlx5_load_one(dev, false);
devlink_remote_reload_actions_performed(priv_to_devlink(dev), 0,
BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
......
......@@ -10,6 +10,7 @@ struct mlx5_timeouts {
static const u32 tout_def_sw_val[MAX_TIMEOUT_TYPES] = {
[MLX5_TO_FW_PRE_INIT_TIMEOUT_MS] = 120000,
[MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS] = 7200000,
[MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS] = 20000,
[MLX5_TO_FW_PRE_INIT_WAIT_MS] = 2,
[MLX5_TO_FW_INIT_MS] = 2000,
......
......@@ -7,6 +7,7 @@
enum mlx5_timeouts_types {
/* pre init timeouts (not read from FW) */
MLX5_TO_FW_PRE_INIT_TIMEOUT_MS,
MLX5_TO_FW_PRE_INIT_ON_RECOVERY_TIMEOUT_MS,
MLX5_TO_FW_PRE_INIT_WARN_MESSAGE_INTERVAL_MS,
MLX5_TO_FW_PRE_INIT_WAIT_MS,
......
......@@ -1003,7 +1003,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
mlx5_devcom_unregister_device(dev->priv.devcom);
}
static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
{
int err;
......@@ -1018,11 +1018,11 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
/* wait for firmware to accept initialization segments configurations
*/
err = wait_fw_init(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT),
err = wait_fw_init(dev, timeout,
mlx5_tout_ms(dev, FW_PRE_INIT_WARN_MESSAGE_INTERVAL));
if (err) {
mlx5_core_err(dev, "Firmware over %llu MS in pre-initializing state, aborting\n",
mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
timeout);
return err;
}
......@@ -1272,7 +1272,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
mutex_lock(&dev->intf_state_mutex);
dev->state = MLX5_DEVICE_STATE_UP;
err = mlx5_function_setup(dev, true);
err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
if (err)
goto err_function;
......@@ -1336,9 +1336,10 @@ void mlx5_uninit_one(struct mlx5_core_dev *dev)
mutex_unlock(&dev->intf_state_mutex);
}
int mlx5_load_one(struct mlx5_core_dev *dev)
int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery)
{
int err = 0;
u64 timeout;
mutex_lock(&dev->intf_state_mutex);
if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
......@@ -1348,7 +1349,11 @@ int mlx5_load_one(struct mlx5_core_dev *dev)
/* remove any previous indication of internal error */
dev->state = MLX5_DEVICE_STATE_UP;
err = mlx5_function_setup(dev, false);
if (recovery)
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
else
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
err = mlx5_function_setup(dev, timeout);
if (err)
goto err_function;
......@@ -1719,7 +1724,7 @@ static void mlx5_pci_resume(struct pci_dev *pdev)
mlx5_pci_trace(dev, "Enter, loading driver..\n");
err = mlx5_load_one(dev);
err = mlx5_load_one(dev, false);
mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
!err ? "recovered" : "Failed");
......@@ -1807,7 +1812,7 @@ static int mlx5_resume(struct pci_dev *pdev)
{
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
return mlx5_load_one(dev);
return mlx5_load_one(dev, false);
}
static const struct pci_device_id mlx5_core_pci_table[] = {
......@@ -1852,7 +1857,7 @@ int mlx5_recover_device(struct mlx5_core_dev *dev)
return -EIO;
}
return mlx5_load_one(dev);
return mlx5_load_one(dev, true);
}
static struct pci_driver mlx5_core_driver = {
......
......@@ -290,7 +290,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev);
int mlx5_init_one(struct mlx5_core_dev *dev);
void mlx5_uninit_one(struct mlx5_core_dev *dev);
void mlx5_unload_one(struct mlx5_core_dev *dev);
int mlx5_load_one(struct mlx5_core_dev *dev);
int mlx5_load_one(struct mlx5_core_dev *dev, bool recovery);
int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment