Commit 8d9aa980 authored by Oded Gabbay

habanalabs: add support for f/w reset

When the f/w runs in secured mode, it can reset the ASIC when certain
events occur. In unsecured mode, the driver asks the f/w to reset the
ASIC for those events.

We need to perform the entire reset procedure but without accessing the
ASIC, i.e. without halting the engines and without sending messages
to the f/w.
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 56e753d5
...@@ -311,9 +311,15 @@ static void device_hard_reset_pending(struct work_struct *work) ...@@ -311,9 +311,15 @@ static void device_hard_reset_pending(struct work_struct *work)
container_of(work, struct hl_device_reset_work, container_of(work, struct hl_device_reset_work,
reset_work.work); reset_work.work);
struct hl_device *hdev = device_reset_work->hdev; struct hl_device *hdev = device_reset_work->hdev;
u32 flags;
int rc; int rc;
rc = hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD); flags = HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD;
if (device_reset_work->fw_reset)
flags |= HL_RESET_FW;
rc = hl_device_reset(hdev, flags);
if ((rc == -EBUSY) && !hdev->device_fini_pending) { if ((rc == -EBUSY) && !hdev->device_fini_pending) {
dev_info(hdev->dev, dev_info(hdev->dev,
"Could not reset device. will try again in %u seconds", "Could not reset device. will try again in %u seconds",
...@@ -702,7 +708,7 @@ static void take_release_locks(struct hl_device *hdev) ...@@ -702,7 +708,7 @@ static void take_release_locks(struct hl_device *hdev)
mutex_unlock(&hdev->fpriv_list_lock); mutex_unlock(&hdev->fpriv_list_lock);
} }
static void cleanup_resources(struct hl_device *hdev, bool hard_reset) static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset)
{ {
if (hard_reset) if (hard_reset)
device_late_fini(hdev); device_late_fini(hdev);
...@@ -712,7 +718,7 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset) ...@@ -712,7 +718,7 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset)
* completions from H/W and we won't have any accesses from the * completions from H/W and we won't have any accesses from the
* H/W to the host machine * H/W to the host machine
*/ */
hdev->asic_funcs->halt_engines(hdev, hard_reset); hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);
/* Go over all the queues, release all CS and their jobs */ /* Go over all the queues, release all CS and their jobs */
hl_cs_rollback_all(hdev); hl_cs_rollback_all(hdev);
...@@ -922,7 +928,7 @@ static void device_disable_open_processes(struct hl_device *hdev) ...@@ -922,7 +928,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
int hl_device_reset(struct hl_device *hdev, u32 flags) int hl_device_reset(struct hl_device *hdev, u32 flags)
{ {
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0}; u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
bool hard_reset, from_hard_reset_thread, hard_instead_soft = false; bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
int i, rc; int i, rc;
if (!hdev->init_done) { if (!hdev->init_done) {
...@@ -933,6 +939,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) ...@@ -933,6 +939,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hard_reset = !!(flags & HL_RESET_HARD); hard_reset = !!(flags & HL_RESET_HARD);
from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD); from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD);
fw_reset = !!(flags & HL_RESET_FW);
if (!hard_reset && !hdev->supports_soft_reset) { if (!hard_reset && !hdev->supports_soft_reset) {
hard_instead_soft = true; hard_instead_soft = true;
...@@ -984,11 +991,13 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) ...@@ -984,11 +991,13 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
else else
hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN; hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
/* /* If reset is due to heartbeat, device CPU is no responsive in
* if reset is due to heartbeat, device CPU is no responsive in * which case no point sending PCI disable message to it.
* which case no point sending PCI disable message to it *
* If F/W is performing the reset, no need to send it a message to disable
* PCI access
*/ */
if (hard_reset && !(flags & HL_RESET_HEARTBEAT)) { if (hard_reset && !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
/* Disable PCI access from device F/W so he won't send /* Disable PCI access from device F/W so he won't send
* us additional interrupts. We disable MSI/MSI-X at * us additional interrupts. We disable MSI/MSI-X at
* the halt_engines function and we can't have the F/W * the halt_engines function and we can't have the F/W
...@@ -1018,6 +1027,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) ...@@ -1018,6 +1027,8 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hdev->process_kill_trial_cnt = 0; hdev->process_kill_trial_cnt = 0;
hdev->device_reset_work.fw_reset = fw_reset;
/* /*
* Because the reset function can't run from heartbeat work, * Because the reset function can't run from heartbeat work,
* we need to call the reset function from a dedicated work. * we need to call the reset function from a dedicated work.
...@@ -1028,7 +1039,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) ...@@ -1028,7 +1039,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
return 0; return 0;
} }
cleanup_resources(hdev, hard_reset); cleanup_resources(hdev, hard_reset, fw_reset);
kill_processes: kill_processes:
if (hard_reset) { if (hard_reset) {
...@@ -1062,7 +1073,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags) ...@@ -1062,7 +1073,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
} }
/* Reset the H/W. It will be in idle state after this returns */ /* Reset the H/W. It will be in idle state after this returns */
hdev->asic_funcs->hw_fini(hdev, hard_reset); hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
if (hard_reset) { if (hard_reset) {
hdev->fw_loader.linux_loaded = false; hdev->fw_loader.linux_loaded = false;
...@@ -1587,7 +1598,7 @@ void hl_device_fini(struct hl_device *hdev) ...@@ -1587,7 +1598,7 @@ void hl_device_fini(struct hl_device *hdev)
hl_hwmon_fini(hdev); hl_hwmon_fini(hdev);
cleanup_resources(hdev, true); cleanup_resources(hdev, true, false);
/* Kill processes here after CS rollback. This is because the process /* Kill processes here after CS rollback. This is because the process
* can't really exit until all its CSs are done, which is what we * can't really exit until all its CSs are done, which is what we
...@@ -1606,7 +1617,7 @@ void hl_device_fini(struct hl_device *hdev) ...@@ -1606,7 +1617,7 @@ void hl_device_fini(struct hl_device *hdev)
hl_cb_pool_fini(hdev); hl_cb_pool_fini(hdev);
/* Reset the H/W. It will be in idle state after this returns */ /* Reset the H/W. It will be in idle state after this returns */
hdev->asic_funcs->hw_fini(hdev, true); hdev->asic_funcs->hw_fini(hdev, true, false);
hdev->fw_loader.linux_loaded = false; hdev->fw_loader.linux_loaded = false;
......
...@@ -128,12 +128,17 @@ enum hl_mmu_page_table_location { ...@@ -128,12 +128,17 @@ enum hl_mmu_page_table_location {
* *
* - HL_RESET_DEVICE_RELEASE * - HL_RESET_DEVICE_RELEASE
* Set if reset is due to device release * Set if reset is due to device release
*
* - HL_RESET_FW
* F/W will perform the reset. No need to ask it to reset the device. This is relevant
* only when running with secured f/w
*/ */
#define HL_RESET_HARD (1 << 0) #define HL_RESET_HARD (1 << 0)
#define HL_RESET_FROM_RESET_THREAD (1 << 1) #define HL_RESET_FROM_RESET_THREAD (1 << 1)
#define HL_RESET_HEARTBEAT (1 << 2) #define HL_RESET_HEARTBEAT (1 << 2)
#define HL_RESET_TDR (1 << 3) #define HL_RESET_TDR (1 << 3)
#define HL_RESET_DEVICE_RELEASE (1 << 4) #define HL_RESET_DEVICE_RELEASE (1 << 4)
#define HL_RESET_FW (1 << 5)
#define HL_MAX_SOBS_PER_MONITOR 8 #define HL_MAX_SOBS_PER_MONITOR 8
...@@ -1170,8 +1175,8 @@ struct hl_asic_funcs { ...@@ -1170,8 +1175,8 @@ struct hl_asic_funcs {
int (*sw_init)(struct hl_device *hdev); int (*sw_init)(struct hl_device *hdev);
int (*sw_fini)(struct hl_device *hdev); int (*sw_fini)(struct hl_device *hdev);
int (*hw_init)(struct hl_device *hdev); int (*hw_init)(struct hl_device *hdev);
void (*hw_fini)(struct hl_device *hdev, bool hard_reset); void (*hw_fini)(struct hl_device *hdev, bool hard_reset, bool fw_reset);
void (*halt_engines)(struct hl_device *hdev, bool hard_reset); void (*halt_engines)(struct hl_device *hdev, bool hard_reset, bool fw_reset);
int (*suspend)(struct hl_device *hdev); int (*suspend)(struct hl_device *hdev);
int (*resume)(struct hl_device *hdev); int (*resume)(struct hl_device *hdev);
int (*mmap)(struct hl_device *hdev, struct vm_area_struct *vma, int (*mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
...@@ -2138,11 +2143,13 @@ struct hwmon_chip_info; ...@@ -2138,11 +2143,13 @@ struct hwmon_chip_info;
* @wq: work queue for device reset procedure. * @wq: work queue for device reset procedure.
* @reset_work: reset work to be done. * @reset_work: reset work to be done.
* @hdev: habanalabs device structure. * @hdev: habanalabs device structure.
* @fw_reset: whether f/w will do the reset without us sending them a message to do it.
*/ */
struct hl_device_reset_work { struct hl_device_reset_work {
struct workqueue_struct *wq; struct workqueue_struct *wq;
struct delayed_work reset_work; struct delayed_work reset_work;
struct hl_device *hdev; struct hl_device *hdev;
bool fw_reset;
}; };
/** /**
......
...@@ -535,7 +535,7 @@ hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state) ...@@ -535,7 +535,7 @@ hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
result = PCI_ERS_RESULT_NONE; result = PCI_ERS_RESULT_NONE;
} }
hdev->asic_funcs->halt_engines(hdev, true); hdev->asic_funcs->halt_engines(hdev, true, false);
return result; return result;
} }
......
...@@ -833,14 +833,14 @@ static int gaudi_early_init(struct hl_device *hdev) ...@@ -833,14 +833,14 @@ static int gaudi_early_init(struct hl_device *hdev)
GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC); GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC);
if (rc) { if (rc) {
if (hdev->reset_on_preboot_fail) if (hdev->reset_on_preboot_fail)
hdev->asic_funcs->hw_fini(hdev, true); hdev->asic_funcs->hw_fini(hdev, true, false);
goto pci_fini; goto pci_fini;
} }
if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) { if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
dev_info(hdev->dev, dev_info(hdev->dev,
"H/W state is dirty, must reset before initializing\n"); "H/W state is dirty, must reset before initializing\n");
hdev->asic_funcs->hw_fini(hdev, true); hdev->asic_funcs->hw_fini(hdev, true, false);
} }
return 0; return 0;
...@@ -3836,7 +3836,7 @@ static void gaudi_disable_timestamp(struct hl_device *hdev) ...@@ -3836,7 +3836,7 @@ static void gaudi_disable_timestamp(struct hl_device *hdev)
WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0); WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
} }
static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset) static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset)
{ {
u32 wait_timeout_ms; u32 wait_timeout_ms;
...@@ -3848,6 +3848,9 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset) ...@@ -3848,6 +3848,9 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
else else
wait_timeout_ms = GAUDI_RESET_WAIT_MSEC; wait_timeout_ms = GAUDI_RESET_WAIT_MSEC;
if (fw_reset)
goto skip_engines;
gaudi_stop_nic_qmans(hdev); gaudi_stop_nic_qmans(hdev);
gaudi_stop_mme_qmans(hdev); gaudi_stop_mme_qmans(hdev);
gaudi_stop_tpc_qmans(hdev); gaudi_stop_tpc_qmans(hdev);
...@@ -3873,6 +3876,7 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset) ...@@ -3873,6 +3876,7 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
gaudi_disable_timestamp(hdev); gaudi_disable_timestamp(hdev);
skip_engines:
gaudi_disable_msi(hdev); gaudi_disable_msi(hdev);
} }
...@@ -4240,7 +4244,7 @@ static int gaudi_hw_init(struct hl_device *hdev) ...@@ -4240,7 +4244,7 @@ static int gaudi_hw_init(struct hl_device *hdev)
return rc; return rc;
} }
static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset) static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
{ {
struct cpu_dyn_regs *dyn_regs = struct cpu_dyn_regs *dyn_regs =
&hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
...@@ -4261,6 +4265,14 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset) ...@@ -4261,6 +4265,14 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC; cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC;
} }
if (fw_reset) {
dev_info(hdev->dev,
"Firmware performs HARD reset, going to wait %dms\n",
reset_timeout_ms);
goto skip_reset;
}
driver_performs_reset = !!(!hdev->asic_prop.fw_security_enabled && driver_performs_reset = !!(!hdev->asic_prop.fw_security_enabled &&
!hdev->asic_prop.hard_reset_done_by_fw); !hdev->asic_prop.hard_reset_done_by_fw);
...@@ -4337,6 +4349,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset) ...@@ -4337,6 +4349,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
reset_timeout_ms); reset_timeout_ms);
} }
skip_reset:
/* /*
* After hard reset, we can't poll the BTM_FSM register because the PSOC * After hard reset, we can't poll the BTM_FSM register because the PSOC
* itself is in reset. Need to wait until the reset is deasserted * itself is in reset. Need to wait until the reset is deasserted
...@@ -7999,10 +8012,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev, ...@@ -7999,10 +8012,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
tpc_dec_event_to_tpc_id(event_type), tpc_dec_event_to_tpc_id(event_type),
"AXI_SLV_DEC_Error"); "AXI_SLV_DEC_Error");
if (reset_required) { if (reset_required) {
dev_err(hdev->dev, "hard reset required due to %s\n", dev_err(hdev->dev, "reset required due to %s\n",
gaudi_irq_map_table[event_type].name); gaudi_irq_map_table[event_type].name);
goto reset_device; hl_device_reset(hdev, 0);
} else { } else {
hl_fw_unmask_irq(hdev, event_type); hl_fw_unmask_irq(hdev, event_type);
} }
...@@ -8021,10 +8034,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev, ...@@ -8021,10 +8034,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
tpc_krn_event_to_tpc_id(event_type), tpc_krn_event_to_tpc_id(event_type),
"KRN_ERR"); "KRN_ERR");
if (reset_required) { if (reset_required) {
dev_err(hdev->dev, "hard reset required due to %s\n", dev_err(hdev->dev, "reset required due to %s\n",
gaudi_irq_map_table[event_type].name); gaudi_irq_map_table[event_type].name);
goto reset_device; hl_device_reset(hdev, 0);
} else { } else {
hl_fw_unmask_irq(hdev, event_type); hl_fw_unmask_irq(hdev, event_type);
} }
...@@ -8154,7 +8167,9 @@ static void gaudi_handle_eqe(struct hl_device *hdev, ...@@ -8154,7 +8167,9 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
return; return;
reset_device: reset_device:
if (hdev->hard_reset_on_fw_events) if (hdev->asic_prop.fw_security_enabled)
hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW);
else if (hdev->hard_reset_on_fw_events)
hl_device_reset(hdev, HL_RESET_HARD); hl_device_reset(hdev, HL_RESET_HARD);
else else
hl_fw_unmask_irq(hdev, event_type); hl_fw_unmask_irq(hdev, event_type);
......
...@@ -654,14 +654,14 @@ static int goya_early_init(struct hl_device *hdev) ...@@ -654,14 +654,14 @@ static int goya_early_init(struct hl_device *hdev)
GOYA_BOOT_FIT_REQ_TIMEOUT_USEC); GOYA_BOOT_FIT_REQ_TIMEOUT_USEC);
if (rc) { if (rc) {
if (hdev->reset_on_preboot_fail) if (hdev->reset_on_preboot_fail)
hdev->asic_funcs->hw_fini(hdev, true); hdev->asic_funcs->hw_fini(hdev, true, false);
goto pci_fini; goto pci_fini;
} }
if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) { if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
dev_info(hdev->dev, dev_info(hdev->dev,
"H/W state is dirty, must reset before initializing\n"); "H/W state is dirty, must reset before initializing\n");
hdev->asic_funcs->hw_fini(hdev, true); hdev->asic_funcs->hw_fini(hdev, true, false);
} }
if (!hdev->pldm) { if (!hdev->pldm) {
...@@ -2380,7 +2380,7 @@ static void goya_disable_timestamp(struct hl_device *hdev) ...@@ -2380,7 +2380,7 @@ static void goya_disable_timestamp(struct hl_device *hdev)
WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0); WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
} }
static void goya_halt_engines(struct hl_device *hdev, bool hard_reset) static void goya_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset)
{ {
u32 wait_timeout_ms; u32 wait_timeout_ms;
...@@ -2703,14 +2703,7 @@ static int goya_hw_init(struct hl_device *hdev) ...@@ -2703,14 +2703,7 @@ static int goya_hw_init(struct hl_device *hdev)
return rc; return rc;
} }
/* static void goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
* goya_hw_fini - Goya hardware tear-down code
*
* @hdev: pointer to hl_device structure
* @hard_reset: should we do hard reset to all engines or just reset the
* compute/dma engines
*/
static void goya_hw_fini(struct hl_device *hdev, bool hard_reset)
{ {
struct goya_device *goya = hdev->asic_specific; struct goya_device *goya = hdev->asic_specific;
u32 reset_timeout_ms, cpu_timeout_ms, status; u32 reset_timeout_ms, cpu_timeout_ms, status;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment