Commit c9a6b82f authored by Andrey Grodzovsky, committed by Alex Deucher

drm/amdgpu: Implement DPC recovery

Add PCI Downstream Port Containment (DPC) support with
basic recovery functionality.

v2: remove pci_save_state to avoid breaking suspend/resume
v3: Fix style comments
v4: Improve description.
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 2a9787dc
...@@ -49,6 +49,8 @@ ...@@ -49,6 +49,8 @@
#include <linux/rbtree.h> #include <linux/rbtree.h>
#include <linux/hashtable.h> #include <linux/hashtable.h>
#include <linux/dma-fence.h> #include <linux/dma-fence.h>
#include <linux/pci.h>
#include <linux/aer.h>
#include <drm/ttm/ttm_bo_api.h> #include <drm/ttm/ttm_bo_api.h>
#include <drm/ttm/ttm_bo_driver.h> #include <drm/ttm/ttm_bo_driver.h>
...@@ -1260,6 +1262,12 @@ static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return ...@@ -1260,6 +1262,12 @@ static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return
void amdgpu_register_gpu_instance(struct amdgpu_device *adev); void amdgpu_register_gpu_instance(struct amdgpu_device *adev);
void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev); void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev);
/*
 * PCI error recovery callbacks. These four functions are the entries of
 * the driver's struct pci_error_handlers table (amdgpu_pci_err_handler).
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev,
pci_channel_state_t state);
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev);
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev);
void amdgpu_pci_resume(struct pci_dev *pdev);
#include "amdgpu_object.h" #include "amdgpu_object.h"
/* used by df_v3_6.c and amdgpu_pmu.c */ /* used by df_v3_6.c and amdgpu_pmu.c */
......
...@@ -2999,6 +2999,7 @@ static const struct attribute *amdgpu_dev_attributes[] = { ...@@ -2999,6 +2999,7 @@ static const struct attribute *amdgpu_dev_attributes[] = {
NULL NULL
}; };
/** /**
* amdgpu_device_init - initialize the driver * amdgpu_device_init - initialize the driver
* *
...@@ -3217,6 +3218,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, ...@@ -3217,6 +3218,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
} }
} }
pci_enable_pcie_error_reporting(adev->ddev.pdev);
/* Post card if necessary */ /* Post card if necessary */
if (amdgpu_device_need_post(adev)) { if (amdgpu_device_need_post(adev)) {
if (!adev->bios) { if (!adev->bios) {
...@@ -4705,3 +4708,161 @@ int amdgpu_device_baco_exit(struct drm_device *dev) ...@@ -4705,3 +4708,161 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
return 0; return 0;
} }
/**
 * amdgpu_pci_error_detected - PCI error-detected callback
 * @pdev: PCI device the error was reported for
 * @state: PCI channel state reported with the error
 *
 * Logs the reported channel state and translates it into a recovery
 * verdict. When the channel is frozen the device is locked with
 * amdgpu_device_lock_adev() before a slot reset is requested.
 *
 * Return: PCI_ERS_RESULT_CAN_RECOVER for pci_channel_io_normal,
 * PCI_ERS_RESULT_DISCONNECT for pci_channel_io_perm_failure and
 * PCI_ERS_RESULT_NEED_RESET for every other state.
 */
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
{
	struct drm_device *ddev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(ddev);
	/* Anything not handled explicitly below asks for a slot reset. */
	pci_ers_result_t verdict = PCI_ERS_RESULT_NEED_RESET;

	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);

	switch (state) {
	case pci_channel_io_normal:
		verdict = PCI_ERS_RESULT_CAN_RECOVER;
		break;
	case pci_channel_io_frozen:
		/* Fatal error: lock the device ahead of the slot reset */
		amdgpu_device_lock_adev(adev);
		break;
	case pci_channel_io_perm_failure:
		/* Permanent error: ask for the device to be disconnected */
		verdict = PCI_ERS_RESULT_DISCONNECT;
		break;
	default:
		break;
	}

	return verdict;
}
/**
 * amdgpu_pci_mmio_enabled - MMIO-enabled stage of PCI error recovery
 * @pdev: pointer to PCI device
 *
 * This is called only if amdgpu_pci_error_detected() returned
 * PCI_ERS_RESULT_CAN_RECOVER: reads and writes to the device still work,
 * so the slot does not need to be reset. Currently it only logs a
 * message.
 *
 * Return: always PCI_ERS_RESULT_RECOVERED.
 */
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
{
	DRM_INFO("PCI error: mmio enabled callback!!\n");

	/* TODO - dump whatever for debugging purposes */

	return PCI_ERS_RESULT_RECOVERED;
}
/**
* amdgpu_pci_slot_reset - Called when PCI slot has been reset.
* @pdev: PCI device struct
*
* Description: This routine is called by the pci error recovery
* code after the PCI slot has been reset, just before we
* should resume normal operations.
*/
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
{
struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = drm_to_adev(dev);
int r;
bool vram_lost;
DRM_INFO("PCI error: slot reset callback!!\n");
pci_restore_state(pdev);
r = amdgpu_device_ip_suspend(adev);
if (r)
goto out;
/* post card */
r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
if (r)
goto out;
r = amdgpu_device_ip_resume_phase1(adev);
if (r)
goto out;
vram_lost = amdgpu_device_check_vram_lost(adev);
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU reset!\n");
amdgpu_inc_vram_lost(adev);
}
r = amdgpu_gtt_mgr_recover(
&adev->mman.bdev.man[TTM_PL_TT]);
if (r)
goto out;
r = amdgpu_device_fw_loading(adev);
if (r)
return r;
r = amdgpu_device_ip_resume_phase2(adev);
if (r)
goto out;
if (vram_lost)
amdgpu_device_fill_reset_magic(adev);
/*
* Add this ASIC as tracked as reset was already
* complete successfully.
*/
amdgpu_register_gpu_instance(adev);
r = amdgpu_device_ip_late_init(adev);
if (r)
goto out;
amdgpu_fbdev_set_suspend(adev, 0);
/* must succeed. */
amdgpu_ras_resume(adev);
amdgpu_irq_gpu_reset_resume_helper(adev);
r = amdgpu_ib_ring_tests(adev);
if (r)
goto out;
r = amdgpu_device_recover_vram(adev);
out:
if (!r) {
DRM_INFO("PCIe error recovery succeeded\n");
} else {
DRM_ERROR("PCIe error recovery failed, err:%d", r);
amdgpu_device_unlock_adev(adev);
}
return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
}
/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery code tells us that it is OK to resume
 * normal operation. Releases the device with amdgpu_device_unlock_adev()
 * and logs that the callback ran.
 *
 * NOTE(review): the unlock here is unconditional, while
 * amdgpu_pci_error_detected() only takes the lock for
 * pci_channel_io_frozen. Confirm this callback cannot run after the
 * PCI_ERS_RESULT_CAN_RECOVER path, where no lock was taken.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct drm_device *ddev = pci_get_drvdata(pdev);
	struct amdgpu_device *adev = drm_to_adev(ddev);

	amdgpu_device_unlock_adev(adev);

	DRM_INFO("PCI error: resume callback!!\n");
}
...@@ -32,7 +32,6 @@ ...@@ -32,7 +32,6 @@
#include <drm/drm_pciids.h> #include <drm/drm_pciids.h>
#include <linux/console.h> #include <linux/console.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h> #include <linux/pm_runtime.h>
#include <linux/vga_switcheroo.h> #include <linux/vga_switcheroo.h>
#include <drm/drm_probe_helper.h> #include <drm/drm_probe_helper.h>
...@@ -1528,6 +1527,13 @@ static struct drm_driver kms_driver = { ...@@ -1528,6 +1527,13 @@ static struct drm_driver kms_driver = {
.patchlevel = KMS_DRIVER_PATCHLEVEL, .patchlevel = KMS_DRIVER_PATCHLEVEL,
}; };
static struct pci_error_handlers amdgpu_pci_err_handler = {
.error_detected = amdgpu_pci_error_detected,
.mmio_enabled = amdgpu_pci_mmio_enabled,
.slot_reset = amdgpu_pci_slot_reset,
.resume = amdgpu_pci_resume,
};
static struct pci_driver amdgpu_kms_pci_driver = { static struct pci_driver amdgpu_kms_pci_driver = {
.name = DRIVER_NAME, .name = DRIVER_NAME,
.id_table = pciidlist, .id_table = pciidlist,
...@@ -1535,6 +1541,7 @@ static struct pci_driver amdgpu_kms_pci_driver = { ...@@ -1535,6 +1541,7 @@ static struct pci_driver amdgpu_kms_pci_driver = {
.remove = amdgpu_pci_remove, .remove = amdgpu_pci_remove,
.shutdown = amdgpu_pci_shutdown, .shutdown = amdgpu_pci_shutdown,
.driver.pm = &amdgpu_pm_ops, .driver.pm = &amdgpu_pm_ops,
.err_handler = &amdgpu_pci_err_handler,
}; };
static int __init amdgpu_init(void) static int __init amdgpu_init(void)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment