Commit bf36b52e authored by Andrey Grodzovsky, committed by Alex Deucher

drm/amdgpu: Avoid accessing HW when suspending SW state

At this point the ASIC has already been reset by the HW/PSP,
so the HW is not in a proper state to be configured for suspension;
some blocks might even be gated, so it is best to avoid touching it.

v2: Rename in_dpc to a more meaningful name.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent c9a6b82f
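
The whole change reduces to one guard flag: adev->in_pci_err_recovery is set around the IP-suspend call in the DPC slot-reset handler, and every low-level HW accessor bails out while it is set. A minimal sketch of the pattern follows (simplified names; suspend_sw_state() is a placeholder for amdgpu_device_ip_suspend(), not a real driver function):

/* Sketch only: the guard pattern this commit applies throughout the
 * driver. Every MMIO accessor checks the flag before touching HW. */
static u32 mm_rreg(struct amdgpu_device *adev, u32 reg)
{
	if (adev->in_pci_err_recovery)
		return 0;	/* HW is post-reset; don't touch it */

	return readl(adev->rmmio + reg * 4);
}

static int slot_reset(struct amdgpu_device *adev)
{
	int r;

	adev->in_pci_err_recovery = true;	/* gate all HW accessors */
	r = suspend_sw_state(adev);		/* placeholder: SW-state-only teardown */
	adev->in_pci_err_recovery = false;

	return r;
}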
@@ -989,6 +989,7 @@ struct amdgpu_device {
 	atomic_t			throttling_logging_enabled;
 	struct ratelimit_state		throttling_logging_rs;
 	uint32_t			ras_features;
+	bool				in_pci_err_recovery;
 };
 
 static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
...
@@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 {
 	uint32_t ret;
 
+	if (adev->in_pci_err_recovery)
+		return 0;
+
 	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
 	    down_read_trylock(&adev->reset_sem)) {
 		ret = amdgpu_kiq_rreg(adev, reg);
@@ -356,6 +359,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
  * Returns the 8 bit value from the offset specified.
  */
 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
+	if (adev->in_pci_err_recovery)
+		return 0;
+
 	if (offset < adev->rmmio_size)
 		return (readb(adev->rmmio + offset));
 	BUG();
@@ -377,6 +383,9 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
  * Writes the value specified to the offset specified.
  */
 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
+	if (adev->in_pci_err_recovery)
+		return;
+
 	if (offset < adev->rmmio_size)
 		writeb(value, adev->rmmio + offset);
 	else
@@ -387,6 +396,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
 				       uint32_t reg, uint32_t v,
 				       uint32_t acc_flags)
 {
+	if (adev->in_pci_err_recovery)
+		return;
+
 	trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
 
 	if ((reg * 4) < adev->rmmio_size)
@@ -414,6 +426,9 @@ static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 		    uint32_t acc_flags)
 {
+	if (adev->in_pci_err_recovery)
+		return;
+
 	if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
 	    down_read_trylock(&adev->reset_sem)) {
 		amdgpu_kiq_wreg(adev, reg, v);
@@ -432,6 +447,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 		    uint32_t acc_flags)
 {
+	if (adev->in_pci_err_recovery)
+		return;
+
 	if (amdgpu_sriov_fullaccess(adev) &&
 	    adev->gfx.rlc.funcs &&
 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
@@ -453,6 +471,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t
  */
 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 {
+	if (adev->in_pci_err_recovery)
+		return 0;
+
 	if ((reg * 4) < adev->rio_mem_size)
 		return ioread32(adev->rio_mem + (reg * 4));
 	else {
@@ -472,6 +493,9 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 {
+	if (adev->in_pci_err_recovery)
+		return;
+
 	if ((reg * 4) < adev->rio_mem_size)
 		iowrite32(v, adev->rio_mem + (reg * 4));
 	else {
@@ -491,6 +515,9 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
  */
 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 {
+	if (adev->in_pci_err_recovery)
+		return 0;
+
 	if (index < adev->doorbell.num_doorbells) {
 		return readl(adev->doorbell.ptr + index);
 	} else {
@@ -511,6 +538,9 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
  */
 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 {
+	if (adev->in_pci_err_recovery)
+		return;
+
 	if (index < adev->doorbell.num_doorbells) {
 		writel(v, adev->doorbell.ptr + index);
 	} else {
@@ -529,6 +559,9 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
  */
 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 {
+	if (adev->in_pci_err_recovery)
+		return 0;
+
 	if (index < adev->doorbell.num_doorbells) {
 		return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 	} else {
@@ -549,6 +582,9 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
  */
 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 {
+	if (adev->in_pci_err_recovery)
+		return;
+
 	if (index < adev->doorbell.num_doorbells) {
 		atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 	} else {
@@ -4778,7 +4814,9 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 
 	pci_restore_state(pdev);
 
+	adev->in_pci_err_recovery = true;
 	r = amdgpu_device_ip_suspend(adev);
+	adev->in_pci_err_recovery = false;
 
 	if (r)
 		goto out;
...
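For reference, amdgpu_pci_slot_reset() above is invoked by the PCI core once the link has been reset after a DPC/AER event. A sketch of how such a callback is wired up via the kernel's standard struct pci_error_handlers; the amdgpu_pci_* names other than amdgpu_pci_slot_reset, and the driver-struct name, are assumptions based on the parent commit, not part of this diff:

/* Sketch of the registration; struct pci_error_handlers and the
 * .err_handler field of struct pci_driver are the standard kernel
 * interface. Handler names other than amdgpu_pci_slot_reset are
 * assumed here, not shown in this diff. */
static const struct pci_error_handlers amdgpu_pci_err_handler = {
	.error_detected	= amdgpu_pci_error_detected,	/* link error seen */
	.slot_reset	= amdgpu_pci_slot_reset,	/* HW reset done, re-init */
	.resume		= amdgpu_pci_resume,		/* traffic re-enabled */
};

static struct pci_driver amdgpu_kms_pci_driver = {
	/* ... probe/remove/id_table ... */
	.err_handler	= &amdgpu_pci_err_handler,
};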
@@ -693,6 +693,9 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
 	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
 	struct amdgpu_ring *ring = &kiq->ring;
 
+	if (adev->in_pci_err_recovery)
+		return 0;
+
 	BUG_ON(!ring->funcs->emit_rreg);
 
 	spin_lock_irqsave(&kiq->ring_lock, flags);
@@ -757,6 +760,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 
 	BUG_ON(!ring->funcs->emit_wreg);
 
+	if (adev->in_pci_err_recovery)
+		return;
+
 	spin_lock_irqsave(&kiq->ring_lock, flags);
 	amdgpu_ring_alloc(ring, 32);
 	amdgpu_ring_emit_wreg(ring, reg, v);
...
@@ -219,6 +219,9 @@ int psp_wait_for(struct psp_context *psp, uint32_t reg_index,
 	int i;
 	struct amdgpu_device *adev = psp->adev;
 
+	if (psp->adev->in_pci_err_recovery)
+		return 0;
+
 	for (i = 0; i < adev->usec_timeout; i++) {
 		val = RREG32(reg_index);
 		if (check_changed) {
@@ -245,6 +248,9 @@ psp_cmd_submit_buf(struct psp_context *psp,
 	bool ras_intr = false;
 	bool skip_unsupport = false;
 
+	if (psp->adev->in_pci_err_recovery)
+		return 0;
+
 	mutex_lock(&psp->mutex);
 
 	memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE);
...
@@ -6980,15 +6980,19 @@ static int gfx_v10_0_hw_fini(void *handle)
 
 	amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
 	amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
+
+	if (!adev->in_pci_err_recovery) {
 #ifndef BRING_UP_DEBUG
-	if (amdgpu_async_gfx_ring) {
-		r = gfx_v10_0_kiq_disable_kgq(adev);
-		if (r)
-			DRM_ERROR("KGQ disable failed\n");
-	}
+		if (amdgpu_async_gfx_ring) {
+			r = gfx_v10_0_kiq_disable_kgq(adev);
+			if (r)
+				DRM_ERROR("KGQ disable failed\n");
+		}
 #endif
-	if (amdgpu_gfx_disable_kcq(adev))
-		DRM_ERROR("KCQ disable failed\n");
+		if (amdgpu_gfx_disable_kcq(adev))
+			DRM_ERROR("KCQ disable failed\n");
+	}
+
 	if (amdgpu_sriov_vf(adev)) {
 		gfx_v10_0_cp_gfx_enable(adev, false);
 		/* Program KIQ position of RLC_CP_SCHEDULERS during destroy */
...
@@ -112,6 +112,9 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
 	struct amdgpu_device *adev = smu->adev;
 	int ret = 0, index = 0;
 
+	if (smu->adev->in_pci_err_recovery)
+		return 0;
+
 	index = smu_cmn_to_asic_specific_index(smu,
 					       CMN2ASIC_MAPPING_MSG,
 					       msg);
...