Commit b82e65a9 authored by Guchun Chen, committed by Alex Deucher

drm/amdgpu: break driver init process when it's bad GPU(v5)

When a bad GPU tag is retrieved from the EEPROM, GPU init should
fail, as the GPU needs to be retired for further checking.

v2: Fix spelling typo, correct the condition to detect
    bad gpu tag and refine error message.

v3: Refine function argument name.

v4: Add the missing check of the return value in the i2c
    initialization error case.

v5: Use dev_err to print PCI information in dmesg instead
    of DRM_ERROR.
Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 1d6a9d12
...@@ -2055,13 +2055,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) ...@@ -2055,13 +2055,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
* it should be called after amdgpu_device_ip_hw_init_phase2 since * it should be called after amdgpu_device_ip_hw_init_phase2 since
* for some ASICs the RAS EEPROM code relies on SMU fully functioning * for some ASICs the RAS EEPROM code relies on SMU fully functioning
* for I2C communication which only true at this point. * for I2C communication which only true at this point.
* recovery_init may fail, but it can free all resources allocated by *
* itself and its failure should not stop amdgpu init process. * amdgpu_ras_recovery_init may fail, but the upper only cares the
* failure from bad gpu situation and stop amdgpu init process
* accordingly. For other failed cases, it will still release all
* the resource and print error message, rather than returning one
* negative value to upper level.
* *
* Note: theoretically, this should be called before all vram allocations * Note: theoretically, this should be called before all vram allocations
* to protect retired page from abusing * to protect retired page from abusing
*/ */
amdgpu_ras_recovery_init(adev); r = amdgpu_ras_recovery_init(adev);
if (r)
goto init_failed;
if (adev->gmc.xgmi.num_physical_nodes > 1) if (adev->gmc.xgmi.num_physical_nodes > 1)
amdgpu_xgmi_add_device(adev); amdgpu_xgmi_add_device(adev);
......
...@@ -1821,6 +1821,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ...@@ -1821,6 +1821,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data; struct ras_err_handler_data **data;
uint32_t max_eeprom_records_len = 0; uint32_t max_eeprom_records_len = 0;
bool exc_err_limit = false;
int ret; int ret;
if (con) if (con)
...@@ -1842,8 +1843,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ...@@ -1842,8 +1843,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
ret = amdgpu_ras_eeprom_init(&con->eeprom_control); ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
if (ret) /*
* This calling fails when exc_err_limit is true or
* ret != 0.
*/
if (exc_err_limit || ret)
goto free; goto free;
if (con->eeprom_control.num_recs) { if (con->eeprom_control.num_recs) {
...@@ -1867,6 +1872,15 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ...@@ -1867,6 +1872,15 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
out: out:
dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
/*
* Except error threshold exceeding case, other failure cases in this
* function would not fail amdgpu driver init.
*/
if (!exc_err_limit)
ret = 0;
else
ret = -EINVAL;
return ret; return ret;
} }
......
...@@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) ...@@ -241,7 +241,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
} }
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
bool *exceed_err_limit)
{ {
int ret = 0; int ret = 0;
struct amdgpu_device *adev = to_amdgpu_device(control); struct amdgpu_device *adev = to_amdgpu_device(control);
...@@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) ...@@ -254,6 +255,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
.buf = buff, .buf = buff,
}; };
*exceed_err_limit = false;
/* Verify i2c adapter is initialized */ /* Verify i2c adapter is initialized */
if (!adev->pm.smu_i2c.algo) if (!adev->pm.smu_i2c.algo)
return -ENOENT; return -ENOENT;
...@@ -282,6 +285,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) ...@@ -282,6 +285,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
control->num_recs); control->num_recs);
} else if ((hdr->header == EEPROM_TABLE_HDR_BAD) &&
(amdgpu_bad_page_threshold != 0)) {
*exceed_err_limit = true;
dev_err(adev->dev, "Exceeding the bad_page_threshold parameter, "
"disabling the GPU.\n");
} else { } else {
DRM_INFO("Creating new EEPROM table"); DRM_INFO("Creating new EEPROM table");
......
...@@ -76,7 +76,8 @@ struct eeprom_table_record { ...@@ -76,7 +76,8 @@ struct eeprom_table_record {
unsigned char mcumc_id; unsigned char mcumc_id;
}__attribute__((__packed__)); }__attribute__((__packed__));
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control); int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
bool *exceed_err_limit);
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control); int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment