Commit 5740682e authored by Monk Liu's avatar Monk Liu Committed by Alex Deucher

drm/amdgpu:implement new GPU recover(v3)

1,new imple names amdgpu_gpu_recover which gives more hint
on what it does compared with gpu_reset

2,gpu_recover unify bare-metal and SR-IOV, only the asic reset
part is implemented differently

3,gpu_recover will increase hang job karma and mark its entity/context
as guilty if exceeds limit

V2:

4,in scheduler main routine the job from guilty context  will be immedialy
fake signaled after it poped from queue and its fence be set with
"-ECANCELED" error

5,in scheduler recovery routine all jobs from the guilty entity would be
dropped

6,in run_job() routine the real IB submission would be skipped if @skip parameter
equales true or there was VRAM lost occured.

V3:

7,replace deprecated gpu reset, use new gpu recover
Signed-off-by: default avatarMonk Liu <Monk.Liu@amd.com>
Reviewed-by: default avatarChristian König <christian.koenig@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 48f05f29
......@@ -178,6 +178,10 @@ extern int amdgpu_cik_support;
#define CIK_CURSOR_WIDTH 128
#define CIK_CURSOR_HEIGHT 128
/* GPU RESET flags */
#define AMDGPU_RESET_INFO_VRAM_LOST (1 << 0)
#define AMDGPU_RESET_INFO_FULLRESET (1 << 1)
struct amdgpu_device;
struct amdgpu_ib;
struct amdgpu_cs_parser;
......@@ -1833,7 +1837,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
#define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i))
/* Common functions */
int amdgpu_gpu_reset(struct amdgpu_device *adev);
int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job);
bool amdgpu_need_backup(struct amdgpu_device *adev);
void amdgpu_pci_config_reset(struct amdgpu_device *adev);
bool amdgpu_need_post(struct amdgpu_device *adev);
......
This diff is collapsed.
......@@ -694,25 +694,25 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data)
}
/**
* amdgpu_debugfs_gpu_reset - manually trigger a gpu reset
* amdgpu_debugfs_gpu_recover - manually trigger a gpu reset & recover
*
* Manually trigger a gpu reset at the next fence wait.
*/
static int amdgpu_debugfs_gpu_reset(struct seq_file *m, void *data)
static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data)
{
struct drm_info_node *node = (struct drm_info_node *) m->private;
struct drm_device *dev = node->minor->dev;
struct amdgpu_device *adev = dev->dev_private;
seq_printf(m, "gpu reset\n");
amdgpu_gpu_reset(adev);
seq_printf(m, "gpu recover\n");
amdgpu_gpu_recover(adev, NULL);
return 0;
}
static const struct drm_info_list amdgpu_debugfs_fence_list[] = {
{"amdgpu_fence_info", &amdgpu_debugfs_fence_info, 0, NULL},
{"amdgpu_gpu_reset", &amdgpu_debugfs_gpu_reset, 0, NULL}
{"amdgpu_gpu_recover", &amdgpu_debugfs_gpu_recover, 0, NULL}
};
static const struct drm_info_list amdgpu_debugfs_fence_list_sriov[] = {
......
......@@ -88,7 +88,7 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work)
reset_work);
if (!amdgpu_sriov_vf(adev))
amdgpu_gpu_reset(adev);
amdgpu_gpu_recover(adev, NULL);
}
/* Disable *all* interrupts */
......
......@@ -37,10 +37,7 @@ static void amdgpu_job_timedout(struct amd_sched_job *s_job)
atomic_read(&job->ring->fence_drv.last_seq),
job->ring->fence_drv.sync_seq);
if (amdgpu_sriov_vf(job->adev))
amdgpu_sriov_gpu_reset(job->adev, job);
else
amdgpu_gpu_reset(job->adev);
amdgpu_gpu_recover(job->adev, job);
}
int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
......
......@@ -288,7 +288,6 @@ int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);
int amdgpu_virt_release_full_gpu(struct amdgpu_device *adev, bool init);
int amdgpu_virt_reset_gpu(struct amdgpu_device *adev);
int amdgpu_virt_wait_reset(struct amdgpu_device *adev);
int amdgpu_sriov_gpu_reset(struct amdgpu_device *adev, struct amdgpu_job *job);
int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev);
void amdgpu_virt_free_mm_table(struct amdgpu_device *adev);
int amdgpu_virt_fw_reserve_get_checksum(void *obj, unsigned long obj_size,
......
......@@ -254,7 +254,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
}
/* Trigger recovery due to world switch failure */
amdgpu_sriov_gpu_reset(adev, NULL);
amdgpu_gpu_recover(adev, NULL);
}
static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev,
......
......@@ -519,7 +519,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
}
/* Trigger recovery due to world switch failure */
amdgpu_sriov_gpu_reset(adev, NULL);
amdgpu_gpu_recover(adev, NULL);
}
static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment