Commit 145b03eb authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: create mmhub ras framework

Enable the mmhub RAS feature and create the sysfs/debugfs nodes for mmhub.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 9fb2d8de
...@@ -177,7 +177,8 @@ struct amdgpu_gmc { ...@@ -177,7 +177,8 @@ struct amdgpu_gmc {
struct amdgpu_xgmi xgmi; struct amdgpu_xgmi xgmi;
struct amdgpu_irq_src ecc_irq; struct amdgpu_irq_src ecc_irq;
struct ras_common_if *ras_if; struct ras_common_if *umc_ras_if;
struct ras_common_if *mmhub_ras_if;
}; };
#define amdgpu_gmc_flush_gpu_tlb(adev, vmid, type) (adev)->gmc.gmc_funcs->flush_gpu_tlb((adev), (vmid), (type)) #define amdgpu_gmc_flush_gpu_tlb(adev, vmid, type) (adev)->gmc.gmc_funcs->flush_gpu_tlb((adev), (vmid), (type))
......
...@@ -266,7 +266,7 @@ static int gmc_v9_0_process_ecc_irq(struct amdgpu_device *adev, ...@@ -266,7 +266,7 @@ static int gmc_v9_0_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source, struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry) struct amdgpu_iv_entry *entry)
{ {
struct ras_common_if *ras_if = adev->gmc.ras_if; struct ras_common_if *ras_if = adev->gmc.umc_ras_if;
struct ras_dispatch_if ih_data = { struct ras_dispatch_if ih_data = {
.entry = entry, .entry = entry,
}; };
...@@ -740,27 +740,25 @@ static int gmc_v9_0_allocate_vm_inv_eng(struct amdgpu_device *adev) ...@@ -740,27 +740,25 @@ static int gmc_v9_0_allocate_vm_inv_eng(struct amdgpu_device *adev)
return 0; return 0;
} }
static int gmc_v9_0_ecc_late_init(void *handle) static int gmc_v9_0_ecc_ras_block_late_init(void *handle,
struct ras_fs_if *fs_info, struct ras_common_if *ras_block)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct amdgpu_device *adev = (struct amdgpu_device *)handle;
struct ras_common_if **ras_if = &adev->gmc.ras_if; struct ras_common_if **ras_if = NULL;
struct ras_ih_if ih_info = { struct ras_ih_if ih_info = {
.cb = gmc_v9_0_process_ras_data_cb, .cb = gmc_v9_0_process_ras_data_cb,
}; };
struct ras_fs_if fs_info = {
.sysfs_name = "umc_err_count",
.debugfs_name = "umc_err_inject",
};
struct ras_common_if ras_block = {
.block = AMDGPU_RAS_BLOCK__UMC,
.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
.sub_block_index = 0,
.name = "umc",
};
int r; int r;
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC)) { if (ras_block->block == AMDGPU_RAS_BLOCK__UMC)
amdgpu_ras_feature_enable_on_boot(adev, &ras_block, 0); ras_if = &adev->gmc.umc_ras_if;
else if (ras_block->block == AMDGPU_RAS_BLOCK__MMHUB)
ras_if = &adev->gmc.mmhub_ras_if;
else
BUG();
if (!amdgpu_ras_is_supported(adev, ras_block->block)) {
amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
return 0; return 0;
} }
...@@ -775,7 +773,7 @@ static int gmc_v9_0_ecc_late_init(void *handle) ...@@ -775,7 +773,7 @@ static int gmc_v9_0_ecc_late_init(void *handle)
if (r == -EAGAIN) { if (r == -EAGAIN) {
/* request a gpu reset. will run again. */ /* request a gpu reset. will run again. */
amdgpu_ras_request_reset_on_boot(adev, amdgpu_ras_request_reset_on_boot(adev,
AMDGPU_RAS_BLOCK__UMC); ras_block->block);
return 0; return 0;
} }
/* fail to enable ras, cleanup all. */ /* fail to enable ras, cleanup all. */
...@@ -789,40 +787,45 @@ static int gmc_v9_0_ecc_late_init(void *handle) ...@@ -789,40 +787,45 @@ static int gmc_v9_0_ecc_late_init(void *handle)
if (!*ras_if) if (!*ras_if)
return -ENOMEM; return -ENOMEM;
**ras_if = ras_block; **ras_if = *ras_block;
r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1); r = amdgpu_ras_feature_enable_on_boot(adev, *ras_if, 1);
if (r) { if (r) {
if (r == -EAGAIN) { if (r == -EAGAIN) {
amdgpu_ras_request_reset_on_boot(adev, amdgpu_ras_request_reset_on_boot(adev,
AMDGPU_RAS_BLOCK__UMC); ras_block->block);
r = 0; r = 0;
} }
goto feature; goto feature;
} }
ih_info.head = **ras_if; ih_info.head = **ras_if;
fs_info.head = **ras_if; fs_info->head = **ras_if;
if (ras_block->block == AMDGPU_RAS_BLOCK__UMC) {
r = amdgpu_ras_interrupt_add_handler(adev, &ih_info); r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
if (r) if (r)
goto interrupt; goto interrupt;
}
amdgpu_ras_debugfs_create(adev, &fs_info); amdgpu_ras_debugfs_create(adev, fs_info);
r = amdgpu_ras_sysfs_create(adev, &fs_info); r = amdgpu_ras_sysfs_create(adev, fs_info);
if (r) if (r)
goto sysfs; goto sysfs;
resume: resume:
if (ras_block->block == AMDGPU_RAS_BLOCK__UMC) {
r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0); r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
if (r) if (r)
goto irq; goto irq;
}
return 0; return 0;
irq: irq:
amdgpu_ras_sysfs_remove(adev, *ras_if); amdgpu_ras_sysfs_remove(adev, *ras_if);
sysfs: sysfs:
amdgpu_ras_debugfs_remove(adev, *ras_if); amdgpu_ras_debugfs_remove(adev, *ras_if);
if (ras_block->block == AMDGPU_RAS_BLOCK__UMC)
amdgpu_ras_interrupt_remove_handler(adev, &ih_info); amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
interrupt: interrupt:
amdgpu_ras_feature_enable(adev, *ras_if, 0); amdgpu_ras_feature_enable(adev, *ras_if, 0);
...@@ -832,6 +835,40 @@ static int gmc_v9_0_ecc_late_init(void *handle) ...@@ -832,6 +835,40 @@ static int gmc_v9_0_ecc_late_init(void *handle)
return r; return r;
} }
/*
 * Run the per-block RAS late init twice: first for the UMC block, then for
 * the MMHUB block.  Each call gets its own block descriptor and its own
 * sysfs/debugfs node names.
 *
 * A non-zero result from the UMC step is returned immediately and the MMHUB
 * step is not attempted; otherwise the result of the MMHUB step is returned.
 */
static int gmc_v9_0_ecc_late_init(void *handle)
{
	/* UMC: block descriptor and the names of its filesystem nodes. */
	struct ras_common_if umc_ras_block = {
		.block = AMDGPU_RAS_BLOCK__UMC,
		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
		.sub_block_index = 0,
		.name = "umc",
	};
	struct ras_fs_if umc_fs_info = {
		.sysfs_name = "umc_err_count",
		.debugfs_name = "umc_err_inject",
	};

	/* MMHUB: block descriptor and the names of its filesystem nodes. */
	struct ras_common_if mmhub_ras_block = {
		.block = AMDGPU_RAS_BLOCK__MMHUB,
		.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
		.sub_block_index = 0,
		.name = "mmhub",
	};
	struct ras_fs_if mmhub_fs_info = {
		.sysfs_name = "mmhub_err_count",
		.debugfs_name = "mmhub_err_inject",
	};

	int ret;

	ret = gmc_v9_0_ecc_ras_block_late_init(handle, &umc_fs_info,
					       &umc_ras_block);
	if (ret != 0)
		return ret;

	return gmc_v9_0_ecc_ras_block_late_init(handle, &mmhub_fs_info,
						&mmhub_ras_block);
}
static int gmc_v9_0_late_init(void *handle) static int gmc_v9_0_late_init(void *handle)
{ {
...@@ -1192,21 +1229,32 @@ static int gmc_v9_0_sw_fini(void *handle) ...@@ -1192,21 +1229,32 @@ static int gmc_v9_0_sw_fini(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct amdgpu_device *adev = (struct amdgpu_device *)handle;
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) && if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
adev->gmc.ras_if) { adev->gmc.umc_ras_if) {
struct ras_common_if *ras_if = adev->gmc.ras_if; struct ras_common_if *ras_if = adev->gmc.umc_ras_if;
struct ras_ih_if ih_info = { struct ras_ih_if ih_info = {
.head = *ras_if, .head = *ras_if,
}; };
/*remove fs first*/ /* remove fs first */
amdgpu_ras_debugfs_remove(adev, ras_if); amdgpu_ras_debugfs_remove(adev, ras_if);
amdgpu_ras_sysfs_remove(adev, ras_if); amdgpu_ras_sysfs_remove(adev, ras_if);
/*remove the IH*/ /* remove the IH */
amdgpu_ras_interrupt_remove_handler(adev, &ih_info); amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
amdgpu_ras_feature_enable(adev, ras_if, 0); amdgpu_ras_feature_enable(adev, ras_if, 0);
kfree(ras_if); kfree(ras_if);
} }
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__MMHUB) &&
adev->gmc.mmhub_ras_if) {
struct ras_common_if *ras_if = adev->gmc.mmhub_ras_if;
/* remove fs and disable ras feature */
amdgpu_ras_debugfs_remove(adev, ras_if);
amdgpu_ras_sysfs_remove(adev, ras_if);
amdgpu_ras_feature_enable(adev, ras_if, 0);
kfree(ras_if);
}
amdgpu_gem_force_release(adev); amdgpu_gem_force_release(adev);
amdgpu_vm_manager_fini(adev); amdgpu_vm_manager_fini(adev);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment