Commit 84586de4 authored by Ofir Bitton's avatar Ofir Bitton Committed by Oded Gabbay

habanalabs: reset device upon FD close if not idle

If device is not idle after user closes the FD we must reset device
as next user that will try to open FD will encounter a non-functional
device.
Signed-off-by: default avatarOfir Bitton <obitton@habana.ai>
Reviewed-by: default avatarOded Gabbay <ogabbay@kernel.org>
Signed-off-by: default avatarOded Gabbay <ogabbay@kernel.org>
parent 8e8125f1
......@@ -12,7 +12,6 @@
static void hl_ctx_fini(struct hl_ctx *ctx)
{
struct hl_device *hdev = ctx->hdev;
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
int i;
/* Release all allocated pending cb's, those cb's were never
......@@ -57,14 +56,6 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
/* Scrub both SRAM and DRAM */
hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
if ((!hdev->pldm) && (hdev->pdev) &&
(!hdev->asic_funcs->is_device_idle(hdev,
idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)))
dev_notice(hdev->dev,
"device not idle after user context is closed (0x%llx, 0x%llx)\n",
idle_mask[0], idle_mask[1]);
} else {
dev_dbg(hdev->dev, "closing kernel context\n");
hdev->asic_funcs->ctx_fini(ctx);
......
......@@ -51,6 +51,8 @@ bool hl_device_operational(struct hl_device *hdev,
static void hpriv_release(struct kref *ref)
{
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
bool device_is_idle = true;
struct hl_fpriv *hpriv;
struct hl_device *hdev;
......@@ -71,7 +73,19 @@ static void hpriv_release(struct kref *ref)
kfree(hpriv);
if (hdev->reset_upon_device_release)
if ((!hdev->pldm) && (hdev->pdev) &&
(!hdev->asic_funcs->is_device_idle(hdev,
idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL))) {
dev_err(hdev->dev,
"device not idle after user context is closed (0x%llx_%llx)\n",
idle_mask[1], idle_mask[0]);
device_is_idle = false;
}
if ((hdev->reset_if_device_not_idle && !device_is_idle)
|| hdev->reset_upon_device_release)
hl_device_reset(hdev, 0);
}
......@@ -1108,8 +1122,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
dev_err(hdev->dev,
"device is not idle (mask %#llx %#llx) after reset\n",
idle_mask[0], idle_mask[1]);
"device is not idle (mask 0x%llx_%llx) after reset\n",
idle_mask[1], idle_mask[0]);
rc = -EIO;
goto out_err;
}
......
......@@ -2311,6 +2311,7 @@ struct hl_device {
u8 rl_enable;
u8 reset_on_preboot_fail;
u8 reset_upon_device_release;
u8 reset_if_device_not_idle;
};
......
......@@ -264,6 +264,7 @@ static void set_driver_behavior_per_device(struct hl_device *hdev)
hdev->bmc_enable = 1;
hdev->hard_reset_on_fw_events = 1;
hdev->reset_on_preboot_fail = 1;
hdev->reset_if_device_not_idle = 1;
hdev->reset_pcilink = 0;
hdev->axi_drain = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment