Commit 60b7dcca, authored by Tomer Tayar, committed by Greg Kroah-Hartman

habanalabs: Dissociate RAZWI info from event types

This patch provides a workaround for a H/W bug in the RAZWI logger in
Goya. The logger doesn't identify the initiator correctly and, as a
result, accesses made by one initiator are reported as if they came from
a different initiator.

The workaround (WA) is to print the error information from the event
entries we receive, without consulting the RAZWI logger at all.
Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent f699f9f9
...@@ -111,29 +111,6 @@ static u16 goya_packet_sizes[MAX_PACKET_ID] = { ...@@ -111,29 +111,6 @@ static u16 goya_packet_sizes[MAX_PACKET_ID] = {
[PACKET_STOP] = sizeof(struct packet_stop) [PACKET_STOP] = sizeof(struct packet_stop)
}; };
static const char *goya_axi_name[GOYA_MAX_INITIATORS] = {
"MME0",
"MME1",
"MME2",
"MME3",
"MME4",
"MME5",
"TPC0",
"TPC1",
"TPC2",
"TPC3",
"TPC4",
"TPC5",
"TPC6",
"TPC7",
"PCI",
"DMA", /* HBW */
"DMA", /* LBW */
"PSOC",
"CPU",
"MMU"
};
static u64 goya_mmu_regs[GOYA_MMU_REGS_NUM] = { static u64 goya_mmu_regs[GOYA_MMU_REGS_NUM] = {
mmDMA_QM_0_GLBL_NON_SECURE_PROPS, mmDMA_QM_0_GLBL_NON_SECURE_PROPS,
mmDMA_QM_1_GLBL_NON_SECURE_PROPS, mmDMA_QM_1_GLBL_NON_SECURE_PROPS,
...@@ -4554,109 +4531,159 @@ static void goya_write_pte(struct hl_device *hdev, u64 addr, u64 val) ...@@ -4554,109 +4531,159 @@ static void goya_write_pte(struct hl_device *hdev, u64 addr, u64 val)
(addr - goya->ddr_bar_cur_addr)); (addr - goya->ddr_bar_cur_addr));
} }
static void goya_get_axi_name(struct hl_device *hdev, u32 agent_id, static const char *_goya_get_event_desc(u16 event_type)
u16 event_type, char *axi_name, int len)
{ {
if (!strcmp(goya_axi_name[agent_id], "DMA")) switch (event_type) {
if (event_type >= GOYA_ASYNC_EVENT_ID_DMA0_CH) case GOYA_ASYNC_EVENT_ID_PCIE_DEC:
snprintf(axi_name, len, "DMA %d", return "PCIe_dec";
event_type - GOYA_ASYNC_EVENT_ID_DMA0_CH); case GOYA_ASYNC_EVENT_ID_TPC0_DEC:
else case GOYA_ASYNC_EVENT_ID_TPC1_DEC:
snprintf(axi_name, len, "DMA %d", case GOYA_ASYNC_EVENT_ID_TPC2_DEC:
event_type - GOYA_ASYNC_EVENT_ID_DMA0_QM); case GOYA_ASYNC_EVENT_ID_TPC3_DEC:
else case GOYA_ASYNC_EVENT_ID_TPC4_DEC:
snprintf(axi_name, len, "%s", goya_axi_name[agent_id]); case GOYA_ASYNC_EVENT_ID_TPC5_DEC:
case GOYA_ASYNC_EVENT_ID_TPC6_DEC:
case GOYA_ASYNC_EVENT_ID_TPC7_DEC:
return "TPC%d_dec";
case GOYA_ASYNC_EVENT_ID_MME_WACS:
return "MME_wacs";
case GOYA_ASYNC_EVENT_ID_MME_WACSD:
return "MME_wacsd";
case GOYA_ASYNC_EVENT_ID_CPU_AXI_SPLITTER:
return "CPU_axi_splitter";
case GOYA_ASYNC_EVENT_ID_PSOC_AXI_DEC:
return "PSOC_axi_dec";
case GOYA_ASYNC_EVENT_ID_PSOC:
return "PSOC";
case GOYA_ASYNC_EVENT_ID_TPC0_KRN_ERR:
case GOYA_ASYNC_EVENT_ID_TPC1_KRN_ERR:
case GOYA_ASYNC_EVENT_ID_TPC2_KRN_ERR:
case GOYA_ASYNC_EVENT_ID_TPC3_KRN_ERR:
case GOYA_ASYNC_EVENT_ID_TPC4_KRN_ERR:
case GOYA_ASYNC_EVENT_ID_TPC5_KRN_ERR:
case GOYA_ASYNC_EVENT_ID_TPC6_KRN_ERR:
case GOYA_ASYNC_EVENT_ID_TPC7_KRN_ERR:
return "TPC%d_krn_err";
case GOYA_ASYNC_EVENT_ID_TPC0_CMDQ ... GOYA_ASYNC_EVENT_ID_TPC7_CMDQ:
return "TPC%d_cq";
case GOYA_ASYNC_EVENT_ID_TPC0_QM ... GOYA_ASYNC_EVENT_ID_TPC7_QM:
return "TPC%d_qm";
case GOYA_ASYNC_EVENT_ID_MME_QM:
return "MME_qm";
case GOYA_ASYNC_EVENT_ID_MME_CMDQ:
return "MME_cq";
case GOYA_ASYNC_EVENT_ID_DMA0_QM ... GOYA_ASYNC_EVENT_ID_DMA4_QM:
return "DMA%d_qm";
case GOYA_ASYNC_EVENT_ID_DMA0_CH ... GOYA_ASYNC_EVENT_ID_DMA4_CH:
return "DMA%d_ch";
default:
return "N/A";
}
} }
static void goya_print_razwi_info(struct hl_device *hdev, u64 reg, static void goya_get_event_desc(u16 event_type, char *desc, size_t size)
bool is_hbw, bool is_read, u16 event_type)
{ {
u32 val, agent_id; u8 index;
char axi_name[10] = {0};
val = RREG32(reg);
if (is_hbw) switch (event_type) {
agent_id = (val & GOYA_IRQ_HBW_AGENT_ID_MASK) >> case GOYA_ASYNC_EVENT_ID_TPC0_DEC:
GOYA_IRQ_HBW_AGENT_ID_SHIFT; case GOYA_ASYNC_EVENT_ID_TPC1_DEC:
else case GOYA_ASYNC_EVENT_ID_TPC2_DEC:
agent_id = (val & GOYA_IRQ_LBW_AGENT_ID_MASK) >> case GOYA_ASYNC_EVENT_ID_TPC3_DEC:
GOYA_IRQ_LBW_AGENT_ID_SHIFT; case GOYA_ASYNC_EVENT_ID_TPC4_DEC:
case GOYA_ASYNC_EVENT_ID_TPC5_DEC:
if (agent_id >= GOYA_MAX_INITIATORS) { case GOYA_ASYNC_EVENT_ID_TPC6_DEC:
dev_err(hdev->dev, case GOYA_ASYNC_EVENT_ID_TPC7_DEC:
"Illegal %s %s with wrong initiator id %d, H/W IRQ %d\n", index = (event_type - GOYA_ASYNC_EVENT_ID_TPC0_DEC) / 3;
is_read ? "read from" : "write to", snprintf(desc, size, _goya_get_event_desc(event_type), index);
is_hbw ? "HBW" : "LBW", break;
agent_id, case GOYA_ASYNC_EVENT_ID_TPC0_KRN_ERR:
event_type); case GOYA_ASYNC_EVENT_ID_TPC1_KRN_ERR:
} else { case GOYA_ASYNC_EVENT_ID_TPC2_KRN_ERR:
goya_get_axi_name(hdev, agent_id, event_type, axi_name, case GOYA_ASYNC_EVENT_ID_TPC3_KRN_ERR:
sizeof(axi_name)); case GOYA_ASYNC_EVENT_ID_TPC4_KRN_ERR:
dev_err(hdev->dev, "Illegal %s by %s %s %s, H/W IRQ %d\n", case GOYA_ASYNC_EVENT_ID_TPC5_KRN_ERR:
is_read ? "read" : "write", case GOYA_ASYNC_EVENT_ID_TPC6_KRN_ERR:
axi_name, case GOYA_ASYNC_EVENT_ID_TPC7_KRN_ERR:
is_read ? "from" : "to", index = (event_type - GOYA_ASYNC_EVENT_ID_TPC0_KRN_ERR) / 10;
is_hbw ? "HBW" : "LBW", snprintf(desc, size, _goya_get_event_desc(event_type), index);
event_type); break;
case GOYA_ASYNC_EVENT_ID_TPC0_CMDQ ... GOYA_ASYNC_EVENT_ID_TPC7_CMDQ:
index = event_type - GOYA_ASYNC_EVENT_ID_TPC0_CMDQ;
snprintf(desc, size, _goya_get_event_desc(event_type), index);
break;
case GOYA_ASYNC_EVENT_ID_TPC0_QM ... GOYA_ASYNC_EVENT_ID_TPC7_QM:
index = event_type - GOYA_ASYNC_EVENT_ID_TPC0_QM;
snprintf(desc, size, _goya_get_event_desc(event_type), index);
break;
case GOYA_ASYNC_EVENT_ID_DMA0_QM ... GOYA_ASYNC_EVENT_ID_DMA4_QM:
index = event_type - GOYA_ASYNC_EVENT_ID_DMA0_QM;
snprintf(desc, size, _goya_get_event_desc(event_type), index);
break;
case GOYA_ASYNC_EVENT_ID_DMA0_CH ... GOYA_ASYNC_EVENT_ID_DMA4_CH:
index = event_type - GOYA_ASYNC_EVENT_ID_DMA0_CH;
snprintf(desc, size, _goya_get_event_desc(event_type), index);
break;
default:
snprintf(desc, size, _goya_get_event_desc(event_type));
break;
} }
} }
static void goya_print_irq_info(struct hl_device *hdev, u16 event_type) static void goya_print_razwi_info(struct hl_device *hdev)
{ {
struct goya_device *goya = hdev->asic_specific;
bool is_hbw = false, is_read = false, is_info = false;
if (RREG32(mmDMA_MACRO_RAZWI_LBW_WT_VLD)) { if (RREG32(mmDMA_MACRO_RAZWI_LBW_WT_VLD)) {
goya_print_razwi_info(hdev, mmDMA_MACRO_RAZWI_LBW_WT_ID, is_hbw, dev_err(hdev->dev, "Illegal write to LBW\n");
is_read, event_type);
WREG32(mmDMA_MACRO_RAZWI_LBW_WT_VLD, 0); WREG32(mmDMA_MACRO_RAZWI_LBW_WT_VLD, 0);
is_info = true;
} }
if (RREG32(mmDMA_MACRO_RAZWI_LBW_RD_VLD)) { if (RREG32(mmDMA_MACRO_RAZWI_LBW_RD_VLD)) {
is_read = true; dev_err(hdev->dev, "Illegal read from LBW\n");
goya_print_razwi_info(hdev, mmDMA_MACRO_RAZWI_LBW_RD_ID, is_hbw,
is_read, event_type);
WREG32(mmDMA_MACRO_RAZWI_LBW_RD_VLD, 0); WREG32(mmDMA_MACRO_RAZWI_LBW_RD_VLD, 0);
is_info = true;
} }
if (RREG32(mmDMA_MACRO_RAZWI_HBW_WT_VLD)) { if (RREG32(mmDMA_MACRO_RAZWI_HBW_WT_VLD)) {
is_hbw = true; dev_err(hdev->dev, "Illegal write to HBW\n");
goya_print_razwi_info(hdev, mmDMA_MACRO_RAZWI_HBW_WT_ID, is_hbw,
is_read, event_type);
WREG32(mmDMA_MACRO_RAZWI_HBW_WT_VLD, 0); WREG32(mmDMA_MACRO_RAZWI_HBW_WT_VLD, 0);
is_info = true;
} }
if (RREG32(mmDMA_MACRO_RAZWI_HBW_RD_VLD)) { if (RREG32(mmDMA_MACRO_RAZWI_HBW_RD_VLD)) {
is_hbw = true; dev_err(hdev->dev, "Illegal read from HBW\n");
is_read = true;
goya_print_razwi_info(hdev, mmDMA_MACRO_RAZWI_HBW_RD_ID, is_hbw,
is_read, event_type);
WREG32(mmDMA_MACRO_RAZWI_HBW_RD_VLD, 0); WREG32(mmDMA_MACRO_RAZWI_HBW_RD_VLD, 0);
is_info = true;
}
if (!is_info) {
dev_err(hdev->dev,
"Received H/W interrupt %d, no additional info\n",
event_type);
return;
} }
}
if (goya->hw_cap_initialized & HW_CAP_MMU) { static void goya_print_mmu_error_info(struct hl_device *hdev)
u32 val = RREG32(mmMMU_PAGE_ERROR_CAPTURE); {
struct goya_device *goya = hdev->asic_specific;
u64 addr; u64 addr;
u32 val;
if (!(goya->hw_cap_initialized & HW_CAP_MMU))
return;
val = RREG32(mmMMU_PAGE_ERROR_CAPTURE);
if (val & MMU_PAGE_ERROR_CAPTURE_ENTRY_VALID_MASK) { if (val & MMU_PAGE_ERROR_CAPTURE_ENTRY_VALID_MASK) {
addr = val & MMU_PAGE_ERROR_CAPTURE_VA_49_32_MASK; addr = val & MMU_PAGE_ERROR_CAPTURE_VA_49_32_MASK;
addr <<= 32; addr <<= 32;
addr |= RREG32(mmMMU_PAGE_ERROR_CAPTURE_VA); addr |= RREG32(mmMMU_PAGE_ERROR_CAPTURE_VA);
dev_err(hdev->dev, "MMU page fault on va 0x%llx\n", dev_err(hdev->dev, "MMU page fault on va 0x%llx\n", addr);
addr);
WREG32(mmMMU_PAGE_ERROR_CAPTURE, 0); WREG32(mmMMU_PAGE_ERROR_CAPTURE, 0);
} }
} }
static void goya_print_irq_info(struct hl_device *hdev, u16 event_type)
{
char desc[20] = "";
goya_get_event_desc(event_type, desc, sizeof(desc));
dev_err(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
event_type, desc);
goya_print_razwi_info(hdev);
goya_print_mmu_error_info(hdev);
} }
static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr, static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment