Commit c84d4670 authored by Guchun Chen, committed by Alex Deucher

drm/amdgpu: validate bad page threshold in ras(v3)

The bad page threshold value must fall within the range between
-1 and the eeprom's maximum record length. It determines when the
number of saved bad pages exceeds the threshold, so that the
corresponding actions can be taken.

v2: When using the default typical value, take the minimum of the
typical value and the eeprom's maximum record length.

v3: drop the case of setting bad_page_cnt_threshold to 0xFFFFFFFF,
    as it confuses the user.
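
For illustration, here is a minimal user-space sketch of the threshold
policy described above; RAS_BAD_PAGE_RATE mirrors the patch, while
pick_threshold(), the 16GB VRAM size and the 1536-record eeprom
capacity are assumptions made only for this example.

#include <stdint.h>
#include <stdio.h>

/* 1 bad page per 100MB of VRAM, as in the patch. */
#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL)

static uint32_t pick_threshold(int64_t user_threshold, uint64_t vram_size,
                               uint32_t max_eeprom_records)
{
        /* Clamp the module parameter into the documented [-1, max] range. */
        if (user_threshold < -1)
                user_threshold = -1;
        else if (user_threshold > max_eeprom_records)
                user_threshold = max_eeprom_records;

        if (user_threshold == -1) {
                /* Typical value: one bad page per 100MB of VRAM,
                 * capped by the eeprom record capacity.
                 */
                uint64_t typical = vram_size / RAS_BAD_PAGE_RATE;
                return typical < max_eeprom_records ?
                        (uint32_t)typical : max_eeprom_records;
        }

        /* 0 disables bad page retirement; other accepted values are used as-is. */
        return (uint32_t)user_threshold;
}

int main(void)
{
        /* 16GB of VRAM with the default -1 parameter -> 163 pages. */
        printf("threshold = %u\n", pick_threshold(-1, 16ULL << 30, 1536));
        return 0;
}
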
Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent acc0204c
@@ -69,6 +69,9 @@ const char *ras_block_string[] = {
/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL)
enum amdgpu_ras_retire_page_reservation {
        AMDGPU_RAS_RETIRE_PAGE_RESERVED,
        AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -1699,6 +1702,47 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
        return ret;
}

static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
                                          uint32_t max_length)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        int tmp_threshold = amdgpu_bad_page_threshold;
        u64 val;

        /*
         * Justification of value bad_page_cnt_threshold in ras structure
         *
         * Generally, -1 <= amdgpu_bad_page_threshold <= max record length
         * in eeprom, and introduce two scenarios accordingly.
         *
         * Bad page retirement enablement:
         *  - If amdgpu_bad_page_threshold = -1,
         *    bad_page_cnt_threshold = typical value by formula.
         *
         *  - When the value from user is 0 < amdgpu_bad_page_threshold <
         *    max record length in eeprom, use it directly.
         *
         * Bad page retirement disablement:
         *  - If amdgpu_bad_page_threshold = 0, bad page retirement
         *    functionality is disabled, and bad_page_cnt_threshold will
         *    take no effect.
         */
        if (tmp_threshold < -1)
                tmp_threshold = -1;
        else if (tmp_threshold > max_length)
                tmp_threshold = max_length;

        if (tmp_threshold == -1) {
                val = adev->gmc.mc_vram_size;
                do_div(val, RAS_BAD_PAGE_RATE);
                con->bad_page_cnt_threshold = min(lower_32_bits(val),
                                                  max_length);
        } else {
                con->bad_page_cnt_threshold = tmp_threshold;
        }
}

/* called in gpu recovery/init */
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
{
@@ -1776,6 +1820,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
{
        struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
        struct ras_err_handler_data **data;
        uint32_t max_eeprom_records_len = 0;
        int ret;

        if (con)
@@ -1794,6 +1839,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
        atomic_set(&con->in_recovery, 0);
        con->adev = adev;

        max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
        amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);

        ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
        if (ret)
                goto free;
......
@@ -336,6 +336,9 @@ struct amdgpu_ras {
        struct amdgpu_ras_eeprom_control eeprom_control;

        bool error_query_ready;

        /* bad page count threshold */
        uint32_t bad_page_cnt_threshold;
};
struct ras_fs_data {
......
@@ -499,6 +499,11 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
        return ret == num ? 0 : -EIO;
}

inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
{
        return EEPROM_MAX_RECORD_NUM;
}

/* Used for testing if bugs encountered */
#if 0
void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
......
@@ -84,6 +84,8 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
                                     bool write,
                                     int num);

inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void);

void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control);
#endif // _AMDGPU_RAS_EEPROM_H
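
As a closing note, the sketch below shows a hypothetical consumer of the
new bad_page_cnt_threshold field added above. The commit message only
states that the threshold determines when the saved bad pages exceed it,
so the ras_threshold_ctx struct and bad_pages_exceed_threshold() are
illustrative stand-ins written as standalone C, not driver API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ras_threshold_ctx {
        int bad_page_threshold_param;     /* mirrors amdgpu_bad_page_threshold */
        uint32_t bad_page_cnt_threshold;  /* value picked by the validation above */
};

/* True once the saved bad page count crosses the validated threshold,
 * i.e. the point at which the "corresponding actions" from the commit
 * message would be taken.
 */
static bool bad_pages_exceed_threshold(const struct ras_threshold_ctx *ctx,
                                       uint32_t saved_bad_pages)
{
        /* A module parameter of 0 disables bad page retirement entirely. */
        if (ctx->bad_page_threshold_param == 0)
                return false;

        return saved_bad_pages > ctx->bad_page_cnt_threshold;
}

int main(void)
{
        struct ras_threshold_ctx ctx = {
                .bad_page_threshold_param = -1,
                .bad_page_cnt_threshold = 163,
        };

        /* 200 saved bad pages against a threshold of 163 -> prints 1. */
        printf("%d\n", bad_pages_exceed_threshold(&ctx, 200));
        return 0;
}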