Commit 48f05f29 authored by Monk Liu, committed by Alex Deucher

amd/scheduler: implement job skip feature (v3)

Jobs are skipped in two cases:

1) When the entity behind a job is marked guilty, the job popped
from that entity's queue is dropped in the sched_main loop.

2) In job_recovery(), a job is skipped if its karma is detected
above the limit, and all other jobs sharing the same fence context
are skipped as well. This approach is used because job_recovery()
cannot access job->entity, since the entity may already be dead
(a condensed sketch of this recovery-path behaviour follows below).
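
The following is a minimal, standalone model of that recovery-path skip, not the kernel code itself: the types and names (model_job, model_job_recovery) are made up for illustration. It shows how the first job whose karma exceeds the hang limit defines a guilty fence context, and how that job plus every later job from the same context gets its fence error set to -ECANCELED.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct model_job {
	int karma;              /* how often this job was blamed for a hang */
	uint64_t fence_context; /* identifies the submitting entity */
	int fence_error;        /* 0 = run normally, <0 = skipped */
};

/* Model of the guilty-context scan done while re-submitting the mirror list. */
static void model_job_recovery(struct model_job *jobs, int count, int hang_limit)
{
	bool found_guilty = false;
	uint64_t guilty_context = 0;

	for (int i = 0; i < count; i++) {
		/* the first job over the limit defines the guilty context */
		if (!found_guilty && jobs[i].karma > hang_limit) {
			found_guilty = true;
			guilty_context = jobs[i].fence_context;
		}
		/* cancel it and every following job from the same context */
		if (found_guilty && jobs[i].fence_context == guilty_context)
			jobs[i].fence_error = -125; /* stand-in for -ECANCELED */
	}
}

int main(void)
{
	struct model_job jobs[] = {
		{ .karma = 0, .fence_context = 1 },
		{ .karma = 3, .fence_context = 2 }, /* karma above limit: guilty */
		{ .karma = 0, .fence_context = 2 }, /* same context: skipped too */
		{ .karma = 0, .fence_context = 3 }, /* unrelated context: still runs */
	};

	model_job_recovery(jobs, 4, 2 /* hang_limit */);
	for (int i = 0; i < 4; i++)
		printf("job %d: fence error %d\n", i, jobs[i].fence_error);
	return 0;
}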

v2:
some logic fixes

v3:
When an entity is detected as guilty, don't drop the job at the
popping stage; instead set its fence error to -ECANCELED.

In run_job(), skip scheduling if either: 1) fence->error < 0,
or 2) a VRAM loss occurred for this job.
This way the job skipping logic is unified (see the sketch below).
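
As a rough illustration of that unified check (a standalone sketch with made-up types, not the amdgpu code), the IBs are only submitted when the finished fence carries no error and no VRAM loss happened since submission:

#include <stdbool.h>
#include <stdio.h>

struct model_run_state {
	int finished_error;      /* error already set on the finished fence */
	unsigned job_vram_lost;  /* counter snapshot taken at submit time */
	unsigned adev_vram_lost; /* device-wide counter at run time */
};

/* Returns true when the IBs should really be sent to the ring. */
static bool model_should_run_job(struct model_run_state *s)
{
	/* a VRAM loss after submission also cancels the job */
	if (s->job_vram_lost != s->adev_vram_lost)
		s->finished_error = -125; /* stand-in for -ECANCELED */

	/* one test covers both the guilty-entity and the VRAM-lost case */
	return s->finished_error >= 0;
}

int main(void)
{
	struct model_run_state ok = { 0, 5, 5 };
	struct model_run_state lost = { 0, 5, 6 };      /* VRAM lost after submit */
	struct model_run_state guilty = { -125, 5, 5 }; /* entity marked guilty */

	printf("ok=%d lost=%d guilty=%d\n",
	       model_should_run_job(&ok),
	       model_should_run_job(&lost),
	       model_should_run_job(&guilty));
	return 0;
}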

With this feature we can introduce a new GPU recovery feature.
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 3a393cf9
@@ -180,7 +180,7 @@ static struct dma_fence *amdgpu_job_dependency(struct amd_sched_job *sched_job,
 static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 {
-	struct dma_fence *fence = NULL;
+	struct dma_fence *fence = NULL, *finished;
 	struct amdgpu_device *adev;
 	struct amdgpu_job *job;
 	int r;
@@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
 		return NULL;
 	}
 	job = to_amdgpu_job(sched_job);
+	finished = &job->base.s_fence->finished;
 	adev = job->adev;
 	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
 	trace_amdgpu_sched_run_job(job);
-	/* skip ib schedule when vram is lost */
-	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
-		dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
-		DRM_ERROR("Skip scheduling IBs!\n");
+
+	if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
+		dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
+
+	if (finished->error < 0) {
+		DRM_INFO("Skip scheduling IBs!\n");
 	} else {
 		r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
 				       &fence);
...
@@ -345,6 +345,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
 		if (amd_sched_entity_add_dependency_cb(entity))
 			return NULL;

+	/* skip jobs from entity that marked guilty */
+	if (entity->guilty && atomic_read(entity->guilty))
+		dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
+
 	spsc_queue_pop(&entity->job_queue);
 	return sched_job;
 }
@@ -441,14 +445,6 @@ static void amd_sched_job_timedout(struct work_struct *work)
 	job->sched->ops->timedout_job(job);
 }

-static void amd_sched_set_guilty(struct amd_sched_job *s_job,
-				 struct amd_sched_entity *s_entity)
-{
-	if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
-		if (s_entity->guilty)
-			atomic_set(s_entity->guilty, 1);
-}
-
 void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
 {
 	struct amd_sched_job *s_job;
@@ -468,21 +464,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_jo
 	spin_unlock(&sched->job_list_lock);

 	if (bad) {
-		bool found = false;
-
-		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
+		/* don't increase @bad's karma if it's from KERNEL RQ,
+		 * becuase sometimes GPU hang would cause kernel jobs (like VM updating jobs)
+		 * corrupt but keep in mind that kernel jobs always considered good.
+		 */
+		for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) {
 			struct amd_sched_rq *rq = &sched->sched_rq[i];

 			spin_lock(&rq->lock);
 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
 				if (bad->s_fence->scheduled.context == entity->fence_context) {
-					found = true;
-					amd_sched_set_guilty(bad, entity);
+					if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit)
+						if (entity->guilty)
+							atomic_set(entity->guilty, 1);
 					break;
 				}
 			}
 			spin_unlock(&rq->lock);
-			if (found)
+			if (&entity->list != &rq->entities)
 				break;
 		}
 	}
@@ -500,6 +499,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
 void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 {
 	struct amd_sched_job *s_job, *tmp;
+	bool found_guilty = false;
 	int r;

 	spin_lock(&sched->job_list_lock);
@@ -511,6 +511,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
 		struct amd_sched_fence *s_fence = s_job->s_fence;
 		struct dma_fence *fence;
+		uint64_t guilty_context;
+
+		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
+			found_guilty = true;
+			guilty_context = s_job->s_fence->scheduled.context;
+		}
+
+		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
+			dma_fence_set_error(&s_fence->finished, -ECANCELED);

 		spin_unlock(&sched->job_list_lock);
 		fence = sched->ops->run_job(s_job);
@@ -526,7 +535,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
 				  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
 		spin_lock(&sched->job_list_lock);
@@ -664,7 +672,6 @@ static int amd_sched_main(void *param)
 				  r);
 			dma_fence_put(fence);
 		} else {
-			DRM_ERROR("Failed to run job!\n");
 			amd_sched_process_job(NULL, &s_fence->cb);
 		}
...