Commit 6e4be987 authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: avoid ras error injection for retired page

Check whether a page is a bad page before UMC error injection; a bad
page should not be accessed again.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 4e930d96
No related merge requests found
...@@ -71,6 +71,9 @@ const char *ras_block_string[] = { ...@@ -71,6 +71,9 @@ const char *ras_block_string[] = {
atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
/*
 * Forward declaration: the definition appears further down, but
 * amdgpu_ras_debugfs_ctrl_write() calls it before that point.
 */
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
uint64_t addr);
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
size_t size, loff_t *pos) size_t size, loff_t *pos)
{ {
...@@ -291,6 +294,14 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * ...@@ -291,6 +294,14 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
break; break;
} }
/* umc ce/ue error injection for a bad page is not allowed */
if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
amdgpu_ras_check_bad_page(adev, data.inject.address)) {
DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n",
data.inject.address);
break;
}
/* data.inject.address is offset instead of absolute gpu address */ /* data.inject.address is offset instead of absolute gpu address */
ret = amdgpu_ras_error_inject(adev, &data.inject); ret = amdgpu_ras_error_inject(adev, &data.inject);
break; break;
...@@ -1431,6 +1442,39 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) ...@@ -1431,6 +1442,39 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
return ret; return ret;
} }
/*
 * amdgpu_ras_check_bad_page - test whether an address lies in a bad page
 *
 * @adev: device whose RAS context is consulted
 * @addr: address to test; it is shifted right by AMDGPU_GPU_PAGE_SHIFT
 *        before being compared against the recorded entries
 *
 * Walks the bps[] entries of the RAS context's eh_data while holding
 * recovery_lock and compares each entry's retired_page with the shifted
 * address.
 *
 * Returns true when a matching entry exists, false otherwise (including
 * when there is no RAS context or no eh_data).
 *
 * Note: this check is only for umc block
 */
static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
				uint64_t addr)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data *eh;
	bool found = false;
	int idx;

	/* No RAS context or no error-handler data: report "not bad". */
	if (!ras || !ras->eh_data)
		return false;

	mutex_lock(&ras->recovery_lock);

	/* Read eh_data again now that recovery_lock is held. */
	eh = ras->eh_data;
	if (eh) {
		uint64_t page = addr >> AMDGPU_GPU_PAGE_SHIFT;

		for (idx = 0; idx < eh->count; idx++) {
			if (page == eh->bps[idx].retired_page) {
				found = true;
				break;
			}
		}
	}

	mutex_unlock(&ras->recovery_lock);
	return found;
}
/* called in gpu recovery/init */ /* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment