Commit b75efe88, authored by Evan Quan, committed by Alex Deucher

drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation

An intentional delay is added when a software CTF (critical temperature
fault) is triggered. After the delay, the GPU temperature is checked
again before any further action is taken. This avoids unintended
shutdowns caused by momentary temperature fluctuations.
Signed-off-by: Evan Quan <evan.quan@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 064329c5
...@@ -286,6 +286,9 @@ extern int amdgpu_user_partt_mode; ...@@ -286,6 +286,9 @@ extern int amdgpu_user_partt_mode;
#define AMDGPU_SMARTSHIFT_MAX_BIAS (100) #define AMDGPU_SMARTSHIFT_MAX_BIAS (100)
#define AMDGPU_SMARTSHIFT_MIN_BIAS (-100) #define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)
/*
 * Extra delay (in ms) applied between a SW CTF interrupt and the follow-up
 * temperature check, so that a momentary temperature fluctuation does not
 * lead to a shutdown.
 */
#define AMDGPU_SWCTF_EXTRA_DELAY 50
struct amdgpu_xcp_mgr; struct amdgpu_xcp_mgr;
struct amdgpu_device; struct amdgpu_device;
struct amdgpu_irq_src; struct amdgpu_irq_src;
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include <linux/gfp.h> #include <linux/gfp.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/firmware.h> #include <linux/firmware.h>
#include <linux/reboot.h>
#include "amd_shared.h" #include "amd_shared.h"
#include "amd_powerplay.h" #include "amd_powerplay.h"
#include "power_state.h" #include "power_state.h"
...@@ -91,6 +92,45 @@ static int pp_early_init(void *handle) ...@@ -91,6 +92,45 @@ static int pp_early_init(void *handle)
return 0; return 0;
} }
/*
 * Delayed-work handler for the powerplay SW CTF (software critical
 * temperature fault) path.
 *
 * Re-reads the GPU temperature and returns without further action only when
 * the read succeeds and the value is below the SW CTF threshold. In every
 * other case (no threshold configured, no read_sensor callback, sensor read
 * failure, or temperature still at/above the threshold) a graceful system
 * shutdown is requested.
 *
 * @work: work_struct embedded in pp_hwmgr::swctf_delayed_work
 */
static void pp_swctf_delayed_work_handler(struct work_struct *work)
{
	struct pp_hwmgr *hwmgr =
		container_of(work, struct pp_hwmgr, swctf_delayed_work.work);
	struct amdgpu_device *adev = hwmgr->adev;
	struct amdgpu_dpm_thermal *range =
				&adev->pm.dpm.thermal;
	uint32_t gpu_temperature;
	/*
	 * Hand the callback the real buffer size instead of an indeterminate
	 * value, in case it validates the size before writing the reading.
	 */
	uint32_t size = sizeof(gpu_temperature);
	int ret;

	/*
	 * If the hotspot/edge temperature is confirmed as below SW CTF setting point
	 * after the delay enforced, nothing will be done.
	 * Otherwise, a graceful shutdown will be performed to prevent further damage.
	 */
	if (range->sw_ctf_threshold &&
	    hwmgr->hwmgr_func->read_sensor) {
		ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
						     AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
						     &gpu_temperature,
						     &size);
		/*
		 * For some legacy ASICs, hotspot temperature retrieving might be not
		 * supported. Check the edge temperature instead then.
		 */
		if (ret == -EOPNOTSUPP)
			ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
							     AMDGPU_PP_SENSOR_EDGE_TEMP,
							     &gpu_temperature,
							     &size);
		/*
		 * The reading is scaled down by 1000 before the comparison;
		 * presumably sensor values are millidegrees and the threshold
		 * is in degrees -- TODO confirm.
		 */
		if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold)
			return;
	}

	dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
	dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
	orderly_poweroff(true);
}
static int pp_sw_init(void *handle) static int pp_sw_init(void *handle)
{ {
struct amdgpu_device *adev = handle; struct amdgpu_device *adev = handle;
...@@ -101,6 +141,10 @@ static int pp_sw_init(void *handle) ...@@ -101,6 +141,10 @@ static int pp_sw_init(void *handle)
pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully"); pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully");
if (!ret)
INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
pp_swctf_delayed_work_handler);
return ret; return ret;
} }
...@@ -135,6 +179,8 @@ static int pp_hw_fini(void *handle) ...@@ -135,6 +179,8 @@ static int pp_hw_fini(void *handle)
struct amdgpu_device *adev = handle; struct amdgpu_device *adev = handle;
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle; struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
hwmgr_hw_fini(hwmgr); hwmgr_hw_fini(hwmgr);
return 0; return 0;
...@@ -221,6 +267,8 @@ static int pp_suspend(void *handle) ...@@ -221,6 +267,8 @@ static int pp_suspend(void *handle)
struct amdgpu_device *adev = handle; struct amdgpu_device *adev = handle;
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle; struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
return hwmgr_suspend(hwmgr); return hwmgr_suspend(hwmgr);
} }
......
...@@ -603,21 +603,17 @@ int phm_irq_process(struct amdgpu_device *adev, ...@@ -603,21 +603,17 @@ int phm_irq_process(struct amdgpu_device *adev,
struct amdgpu_irq_src *source, struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry) struct amdgpu_iv_entry *entry)
{ {
struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
uint32_t client_id = entry->client_id; uint32_t client_id = entry->client_id;
uint32_t src_id = entry->src_id; uint32_t src_id = entry->src_id;
if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) { if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) {
if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) { if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) {
dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); schedule_delayed_work(&hwmgr->swctf_delayed_work,
/* msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
* SW CTF just occurred. } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) {
* Try to do a graceful shutdown to prevent further damage.
*/
dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
orderly_poweroff(true);
} else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW)
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n"); dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) { } else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n"); dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
/* /*
* HW CTF just occurred. Shutdown to prevent further damage. * HW CTF just occurred. Shutdown to prevent further damage.
...@@ -626,15 +622,10 @@ int phm_irq_process(struct amdgpu_device *adev, ...@@ -626,15 +622,10 @@ int phm_irq_process(struct amdgpu_device *adev,
orderly_poweroff(true); orderly_poweroff(true);
} }
} else if (client_id == SOC15_IH_CLIENTID_THM) { } else if (client_id == SOC15_IH_CLIENTID_THM) {
if (src_id == 0) { if (src_id == 0)
dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); schedule_delayed_work(&hwmgr->swctf_delayed_work,
/* msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
* SW CTF just occurred. else
* Try to do a graceful shutdown to prevent further damage.
*/
dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
orderly_poweroff(true);
} else
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n"); dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
} else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) { } else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) {
dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n"); dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
......
...@@ -811,6 +811,8 @@ struct pp_hwmgr { ...@@ -811,6 +811,8 @@ struct pp_hwmgr {
bool gfxoff_state_changed_by_workload; bool gfxoff_state_changed_by_workload;
uint32_t pstate_sclk_peak; uint32_t pstate_sclk_peak;
uint32_t pstate_mclk_peak; uint32_t pstate_mclk_peak;
struct delayed_work swctf_delayed_work;
}; };
int hwmgr_early_init(struct pp_hwmgr *hwmgr); int hwmgr_early_init(struct pp_hwmgr *hwmgr);
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <linux/firmware.h> #include <linux/firmware.h>
#include <linux/pci.h> #include <linux/pci.h>
#include <linux/reboot.h>
#include "amdgpu.h" #include "amdgpu.h"
#include "amdgpu_smu.h" #include "amdgpu_smu.h"
...@@ -1078,6 +1079,34 @@ static void smu_interrupt_work_fn(struct work_struct *work) ...@@ -1078,6 +1079,34 @@ static void smu_interrupt_work_fn(struct work_struct *work)
smu->ppt_funcs->interrupt_work(smu); smu->ppt_funcs->interrupt_work(smu);
} }
/*
 * Delayed-work handler for the swSMU SW CTF (software critical temperature
 * fault) path.
 *
 * Re-reads the hotspot temperature and returns without further action only
 * when a shutdown threshold is configured, a read_sensor callback exists,
 * the read succeeds, and the value is below the threshold. Otherwise a
 * graceful system shutdown is requested.
 *
 * @work: work_struct embedded in smu_context::swctf_delayed_work
 */
static void smu_swctf_delayed_work_handler(struct work_struct *work)
{
	struct smu_context *smu =
		container_of(work, struct smu_context, swctf_delayed_work.work);
	struct smu_temperature_range *range =
				&smu->thermal_range;
	struct amdgpu_device *adev = smu->adev;
	uint32_t hotspot_tmp;
	/*
	 * Hand the callback the real buffer size instead of an indeterminate
	 * value, in case it inspects the size before writing the reading.
	 */
	uint32_t size = sizeof(hotspot_tmp);

	/*
	 * If the hotspot temperature is confirmed as below SW CTF setting point
	 * after the delay enforced, nothing will be done.
	 * Otherwise, a graceful shutdown will be performed to prevent further damage.
	 *
	 * The reading is scaled down by 1000 before the comparison; presumably
	 * sensor values are millidegrees and the threshold is in degrees --
	 * TODO confirm.
	 */
	if (range->software_shutdown_temp &&
	    smu->ppt_funcs->read_sensor &&
	    !smu->ppt_funcs->read_sensor(smu,
					 AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
					 &hotspot_tmp,
					 &size) &&
	    hotspot_tmp / 1000 < range->software_shutdown_temp)
		return;

	dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
	dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
	orderly_poweroff(true);
}
static int smu_sw_init(void *handle) static int smu_sw_init(void *handle)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct amdgpu_device *adev = (struct amdgpu_device *)handle;
...@@ -1120,6 +1149,9 @@ static int smu_sw_init(void *handle) ...@@ -1120,6 +1149,9 @@ static int smu_sw_init(void *handle)
smu->smu_dpm.dpm_level = AMD_DPM_FORCED_LEVEL_AUTO; smu->smu_dpm.dpm_level = AMD_DPM_FORCED_LEVEL_AUTO;
smu->smu_dpm.requested_dpm_level = AMD_DPM_FORCED_LEVEL_AUTO; smu->smu_dpm.requested_dpm_level = AMD_DPM_FORCED_LEVEL_AUTO;
INIT_DELAYED_WORK(&smu->swctf_delayed_work,
smu_swctf_delayed_work_handler);
ret = smu_smc_table_sw_init(smu); ret = smu_smc_table_sw_init(smu);
if (ret) { if (ret) {
dev_err(adev->dev, "Failed to sw init smc table!\n"); dev_err(adev->dev, "Failed to sw init smc table!\n");
...@@ -1600,6 +1632,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu) ...@@ -1600,6 +1632,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
return ret; return ret;
} }
cancel_delayed_work_sync(&smu->swctf_delayed_work);
ret = smu_disable_dpms(smu); ret = smu_disable_dpms(smu);
if (ret) { if (ret) {
dev_err(adev->dev, "Fail to disable dpm features!\n"); dev_err(adev->dev, "Fail to disable dpm features!\n");
......
...@@ -573,6 +573,8 @@ struct smu_context ...@@ -573,6 +573,8 @@ struct smu_context
u32 debug_param_reg; u32 debug_param_reg;
u32 debug_msg_reg; u32 debug_msg_reg;
u32 debug_resp_reg; u32 debug_resp_reg;
struct delayed_work swctf_delayed_work;
}; };
struct i2c_adapter; struct i2c_adapter;
......
...@@ -1412,13 +1412,8 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, ...@@ -1412,13 +1412,8 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
if (client_id == SOC15_IH_CLIENTID_THM) { if (client_id == SOC15_IH_CLIENTID_THM) {
switch (src_id) { switch (src_id) {
case THM_11_0__SRCID__THM_DIG_THERM_L2H: case THM_11_0__SRCID__THM_DIG_THERM_L2H:
dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); schedule_delayed_work(&smu->swctf_delayed_work,
/* msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
* SW CTF just occurred.
* Try to do a graceful shutdown to prevent further damage.
*/
dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
orderly_poweroff(true);
break; break;
case THM_11_0__SRCID__THM_DIG_THERM_H2L: case THM_11_0__SRCID__THM_DIG_THERM_H2L:
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n"); dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");
......
...@@ -1353,13 +1353,8 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev, ...@@ -1353,13 +1353,8 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev,
if (client_id == SOC15_IH_CLIENTID_THM) { if (client_id == SOC15_IH_CLIENTID_THM) {
switch (src_id) { switch (src_id) {
case THM_11_0__SRCID__THM_DIG_THERM_L2H: case THM_11_0__SRCID__THM_DIG_THERM_L2H:
dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n"); schedule_delayed_work(&smu->swctf_delayed_work,
/* msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
* SW CTF just occurred.
* Try to do a graceful shutdown to prevent further damage.
*/
dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
orderly_poweroff(true);
break; break;
case THM_11_0__SRCID__THM_DIG_THERM_H2L: case THM_11_0__SRCID__THM_DIG_THERM_H2L:
dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n"); dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment