Commit b0106bc6, authored by Tomer Tayar, committed by Oded Gabbay

habanalabs: add an option to delay a device reset

Several H/W events can be sent adjacently, even due to a single error.
If a hard-reset is triggered as part of handling one of these events,
the following events won't be handled.
The debug info from these missed events is important, sometimes even
more important than the one that was handled.

To allow handling these close events, add an option to delay a device
reset and use it when resetting due to H/W events.
Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 9c27896a
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
#include <linux/pci.h> #include <linux/pci.h>
#include <linux/hwmon.h> #include <linux/hwmon.h>
#define HL_RESET_DELAY_USEC 10000 /* 10ms */
enum hl_device_status hl_device_status(struct hl_device *hdev) enum hl_device_status hl_device_status(struct hl_device *hdev)
{ {
enum hl_device_status status; enum hl_device_status status;
...@@ -980,7 +982,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) ...@@ -980,7 +982,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
{ {
bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false, bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
reset_upon_device_release = false, schedule_hard_reset = false, reset_upon_device_release = false, schedule_hard_reset = false,
skip_wq_flush = false; skip_wq_flush, delay_reset;
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
struct hl_ctx *ctx; struct hl_ctx *ctx;
int i, rc; int i, rc;
...@@ -994,6 +996,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) ...@@ -994,6 +996,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR); from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW); fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);
skip_wq_flush = !!(flags & HL_DRV_RESET_DEV_RELEASE); skip_wq_flush = !!(flags & HL_DRV_RESET_DEV_RELEASE);
delay_reset = !!(flags & HL_DRV_RESET_DELAY);
if (!hard_reset && !hdev->asic_prop.supports_soft_reset) { if (!hard_reset && !hdev->asic_prop.supports_soft_reset) {
hard_instead_soft = true; hard_instead_soft = true;
...@@ -1043,6 +1046,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) ...@@ -1043,6 +1046,9 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hdev->reset_info.in_reset = 1; hdev->reset_info.in_reset = 1;
spin_unlock(&hdev->reset_info.lock); spin_unlock(&hdev->reset_info.lock);
if (delay_reset)
usleep_range(HL_RESET_DELAY_USEC, HL_RESET_DELAY_USEC << 1);
handle_reset_trigger(hdev, flags); handle_reset_trigger(hdev, flags);
/* This still allows the completion of some KDMA ops */ /* This still allows the completion of some KDMA ops */
......
...@@ -142,6 +142,9 @@ enum hl_mmu_page_table_location { ...@@ -142,6 +142,9 @@ enum hl_mmu_page_table_location {
* *
* - HL_DRV_RESET_FW_FATAL_ERR * - HL_DRV_RESET_FW_FATAL_ERR
* Set if reset is due to a fatal error from FW * Set if reset is due to a fatal error from FW
*
* - HL_DRV_RESET_DELAY
* Set if a delay should be added before the reset
*/ */
#define HL_DRV_RESET_HARD (1 << 0) #define HL_DRV_RESET_HARD (1 << 0)
...@@ -151,6 +154,7 @@ enum hl_mmu_page_table_location { ...@@ -151,6 +154,7 @@ enum hl_mmu_page_table_location {
#define HL_DRV_RESET_DEV_RELEASE (1 << 4) #define HL_DRV_RESET_DEV_RELEASE (1 << 4)
#define HL_DRV_RESET_BYPASS_REQ_TO_FW (1 << 5) #define HL_DRV_RESET_BYPASS_REQ_TO_FW (1 << 5)
#define HL_DRV_RESET_FW_FATAL_ERR (1 << 6) #define HL_DRV_RESET_FW_FATAL_ERR (1 << 6)
#define HL_DRV_RESET_DELAY (1 << 7)
#define HL_MAX_SOBS_PER_MONITOR 8 #define HL_MAX_SOBS_PER_MONITOR 8
......
...@@ -8199,7 +8199,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev, ...@@ -8199,7 +8199,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
| HL_DRV_RESET_BYPASS_REQ_TO_FW | HL_DRV_RESET_BYPASS_REQ_TO_FW
| fw_fatal_err_flag); | fw_fatal_err_flag);
else if (hdev->hard_reset_on_fw_events) else if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, HL_DRV_RESET_HARD | fw_fatal_err_flag); hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_DELAY | fw_fatal_err_flag);
else else
hl_fw_unmask_irq(hdev, event_type); hl_fw_unmask_irq(hdev, event_type);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment