Commit c83c4171 authored by Oded Gabbay

habanalabs: halt device CPU only upon certain reset

Currently the driver halts the device CPU in the halt engines function,
which halts all the engines of the ASIC. The problem is that if later on we
stop the reset process (due to inability to clean memory mappings in time),
the CPU will remain in halt mode. This creates many issues, such as
thermal/power control and FLR handling.

Therefore, move the halting of the device CPU to the very end of the reset
process, just before writing to the registers to initiate the reset. In
addition, the driver now needs to send a message to the device F/W to
stop it from sending interrupts to the host machine, because during the
halt engines function the driver disables the MSI/MSI-X interrupts.
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
parent 9158c47e
...@@ -838,6 +838,22 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset, ...@@ -838,6 +838,22 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
if (rc) if (rc)
return 0; return 0;
if (hard_reset) {
/* Disable PCI access from device F/W so it won't send
* us additional interrupts. We disable MSI/MSI-X at
* the halt_engines function and we can't have the F/W
* sending us interrupts after that. We need to disable
* the access here because if the device is marked as
* disabled, the message won't be sent. Also, in case
* of heartbeat, the device CPU is marked as disabled
* so this message won't be sent
*/
if (hl_fw_send_pci_access_msg(hdev,
ARMCP_PACKET_DISABLE_PCI_ACCESS))
dev_warn(hdev->dev,
"Failed to disable PCI access by F/W\n");
}
/* This also blocks future CS/VM/JOB completion operations */ /* This also blocks future CS/VM/JOB completion operations */
hdev->disabled = true; hdev->disabled = true;
......
...@@ -2578,27 +2578,16 @@ static void gaudi_disable_timestamp(struct hl_device *hdev) ...@@ -2578,27 +2578,16 @@ static void gaudi_disable_timestamp(struct hl_device *hdev)
static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset) static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
{ {
u32 wait_timeout_ms, cpu_timeout_ms; u32 wait_timeout_ms;
dev_info(hdev->dev, dev_info(hdev->dev,
"Halting compute engines and disabling interrupts\n"); "Halting compute engines and disabling interrupts\n");
if (hdev->pldm) { if (hdev->pldm)
wait_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC; wait_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC;
cpu_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC; else
} else {
wait_timeout_ms = GAUDI_RESET_WAIT_MSEC; wait_timeout_ms = GAUDI_RESET_WAIT_MSEC;
cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC;
}
/*
* I don't know what is the state of the CPU so make sure it is
* stopped in any means necessary
*/
WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE);
WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
GAUDI_EVENT_HALT_MACHINE);
msleep(cpu_timeout_ms);
gaudi_stop_mme_qmans(hdev); gaudi_stop_mme_qmans(hdev);
gaudi_stop_tpc_qmans(hdev); gaudi_stop_tpc_qmans(hdev);
...@@ -2966,17 +2955,34 @@ static int gaudi_hw_init(struct hl_device *hdev) ...@@ -2966,17 +2955,34 @@ static int gaudi_hw_init(struct hl_device *hdev)
static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset) static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
{ {
struct gaudi_device *gaudi = hdev->asic_specific; struct gaudi_device *gaudi = hdev->asic_specific;
u32 status, reset_timeout_ms, boot_strap = 0; u32 status, reset_timeout_ms, cpu_timeout_ms, boot_strap = 0;
if (!hard_reset) { if (!hard_reset) {
dev_err(hdev->dev, "GAUDI doesn't support soft-reset\n"); dev_err(hdev->dev, "GAUDI doesn't support soft-reset\n");
return; return;
} }
if (hdev->pldm) if (hdev->pldm) {
reset_timeout_ms = GAUDI_PLDM_HRESET_TIMEOUT_MSEC; reset_timeout_ms = GAUDI_PLDM_HRESET_TIMEOUT_MSEC;
else cpu_timeout_ms = GAUDI_PLDM_RESET_WAIT_MSEC;
} else {
reset_timeout_ms = GAUDI_RESET_TIMEOUT_MSEC; reset_timeout_ms = GAUDI_RESET_TIMEOUT_MSEC;
cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC;
}
/* Set device to handle FLR by H/W as we will put the device CPU to
* halt mode
*/
WREG32(mmPCIE_AUX_FLR_CTRL, (PCIE_AUX_FLR_CTRL_HW_CTRL_MASK |
PCIE_AUX_FLR_CTRL_INT_MASK_MASK));
/* I don't know what is the state of the CPU so make sure it is
* stopped in any means necessary
*/
WREG32(mmPSOC_GLOBAL_CONF_KMD_MSG_TO_CPU, KMD_MSG_GOTO_WFE);
WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, GAUDI_EVENT_HALT_MACHINE);
msleep(cpu_timeout_ms);
/* Tell ASIC not to re-initialize PCIe */ /* Tell ASIC not to re-initialize PCIe */
WREG32(mmPREBOOT_PCIE_EN, LKD_HARD_RESET_MAGIC); WREG32(mmPREBOOT_PCIE_EN, LKD_HARD_RESET_MAGIC);
......
...@@ -2240,29 +2240,15 @@ static void goya_disable_timestamp(struct hl_device *hdev) ...@@ -2240,29 +2240,15 @@ static void goya_disable_timestamp(struct hl_device *hdev)
static void goya_halt_engines(struct hl_device *hdev, bool hard_reset) static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
{ {
u32 wait_timeout_ms, cpu_timeout_ms; u32 wait_timeout_ms;
dev_info(hdev->dev, dev_info(hdev->dev,
"Halting compute engines and disabling interrupts\n"); "Halting compute engines and disabling interrupts\n");
if (hdev->pldm) { if (hdev->pldm)
wait_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC; wait_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC;
cpu_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC; else
} else {
wait_timeout_ms = GOYA_RESET_WAIT_MSEC; wait_timeout_ms = GOYA_RESET_WAIT_MSEC;
cpu_timeout_ms = GOYA_CPU_RESET_WAIT_MSEC;
}
if (hard_reset) {
/*
* I don't know what is the state of the CPU so make sure it is
* stopped in any means necessary
*/
WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_GOTO_WFE);
WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
GOYA_ASYNC_EVENT_ID_HALT_MACHINE);
msleep(cpu_timeout_ms);
}
goya_stop_external_queues(hdev); goya_stop_external_queues(hdev);
goya_stop_internal_queues(hdev); goya_stop_internal_queues(hdev);
...@@ -2567,14 +2553,26 @@ static int goya_hw_init(struct hl_device *hdev) ...@@ -2567,14 +2553,26 @@ static int goya_hw_init(struct hl_device *hdev)
static void goya_hw_fini(struct hl_device *hdev, bool hard_reset) static void goya_hw_fini(struct hl_device *hdev, bool hard_reset)
{ {
struct goya_device *goya = hdev->asic_specific; struct goya_device *goya = hdev->asic_specific;
u32 reset_timeout_ms, status; u32 reset_timeout_ms, cpu_timeout_ms, status;
if (hdev->pldm) if (hdev->pldm) {
reset_timeout_ms = GOYA_PLDM_RESET_TIMEOUT_MSEC; reset_timeout_ms = GOYA_PLDM_RESET_TIMEOUT_MSEC;
else cpu_timeout_ms = GOYA_PLDM_RESET_WAIT_MSEC;
} else {
reset_timeout_ms = GOYA_RESET_TIMEOUT_MSEC; reset_timeout_ms = GOYA_RESET_TIMEOUT_MSEC;
cpu_timeout_ms = GOYA_CPU_RESET_WAIT_MSEC;
}
if (hard_reset) { if (hard_reset) {
/* I don't know what is the state of the CPU so make sure it is
* stopped in any means necessary
*/
WREG32(mmPSOC_GLOBAL_CONF_UBOOT_MAGIC, KMD_MSG_GOTO_WFE);
WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
GOYA_ASYNC_EVENT_ID_HALT_MACHINE);
msleep(cpu_timeout_ms);
goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE); goya_set_ddr_bar_base(hdev, DRAM_PHYS_BASE);
goya_disable_clk_rlx(hdev); goya_disable_clk_rlx(hdev);
goya_set_pll_refclk(hdev); goya_set_pll_refclk(hdev);
......
...@@ -292,6 +292,7 @@ ...@@ -292,6 +292,7 @@
#define mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG 0xC02000 #define mmPCIE_DBI_DEVICE_ID_VENDOR_ID_REG 0xC02000
#define mmPCIE_AUX_FLR_CTRL 0xC07394
#define mmPCIE_AUX_DBI 0xC07490 #define mmPCIE_AUX_DBI 0xC07490
#endif /* ASIC_REG_GAUDI_REGS_H_ */ #endif /* ASIC_REG_GAUDI_REGS_H_ */
...@@ -455,4 +455,7 @@ enum axi_id { ...@@ -455,4 +455,7 @@ enum axi_id {
QM_ARB_ERR_MSG_EN_CHOISE_WDT_MASK |\ QM_ARB_ERR_MSG_EN_CHOISE_WDT_MASK |\
QM_ARB_ERR_MSG_EN_AXI_LBW_ERR_MASK) QM_ARB_ERR_MSG_EN_AXI_LBW_ERR_MASK)
#define PCIE_AUX_FLR_CTRL_HW_CTRL_MASK 0x1
#define PCIE_AUX_FLR_CTRL_INT_MASK_MASK 0x2
#endif /* GAUDI_MASKS_H_ */ #endif /* GAUDI_MASKS_H_ */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment