Commit 542cff78 authored by Andrey Grodzovsky's avatar Andrey Grodzovsky

drm/sched: Avoid lockdep spalt on killing a processes

Probelm:
Singlaning one sched fence from within another's sched
fence singal callback generates lockdep splat because
the both have same lockdep class of their fence->lock

Fix:
Fix bellow stack by rescheduling to irq work of
signaling and killing of jobs that left when entity is killed.

[11176.741181]  dump_stack+0x10/0x12
[11176.741186] __lock_acquire.cold+0x208/0x2df
[11176.741197]  lock_acquire+0xc6/0x2d0
[11176.741204]  ? dma_fence_signal+0x28/0x80
[11176.741212] _raw_spin_lock_irqsave+0x4d/0x70
[11176.741219]  ? dma_fence_signal+0x28/0x80
[11176.741225]  dma_fence_signal+0x28/0x80
[11176.741230] drm_sched_fence_finished+0x12/0x20 [gpu_sched]
[11176.741240] drm_sched_entity_kill_jobs_cb+0x1c/0x50 [gpu_sched]
[11176.741248] dma_fence_signal_timestamp_locked+0xac/0x1a0
[11176.741254]  dma_fence_signal+0x3b/0x80
[11176.741260] drm_sched_fence_finished+0x12/0x20 [gpu_sched]
[11176.741268] drm_sched_job_done.isra.0+0x7f/0x1a0 [gpu_sched]
[11176.741277] drm_sched_job_done_cb+0x12/0x20 [gpu_sched]
[11176.741284] dma_fence_signal_timestamp_locked+0xac/0x1a0
[11176.741290]  dma_fence_signal+0x3b/0x80
[11176.741296] amdgpu_fence_process+0xd1/0x140 [amdgpu]
[11176.741504] sdma_v4_0_process_trap_irq+0x8c/0xb0 [amdgpu]
[11176.741731]  amdgpu_irq_dispatch+0xce/0x250 [amdgpu]
[11176.741954]  amdgpu_ih_process+0x81/0x100 [amdgpu]
[11176.742174]  amdgpu_irq_handler+0x26/0xa0 [amdgpu]
[11176.742393] __handle_irq_event_percpu+0x4f/0x2c0
[11176.742402] handle_irq_event_percpu+0x33/0x80
[11176.742408]  handle_irq_event+0x39/0x60
[11176.742414]  handle_edge_irq+0x93/0x1d0
[11176.742419]  __common_interrupt+0x50/0xe0
[11176.742426]  common_interrupt+0x80/0x90
Signed-off-by: default avatarAndrey Grodzovsky <andrey.grodzovsky@amd.com>
Suggested-by: default avatarDaniel Vetter  <daniel.vetter@ffwll.ch>
Suggested-by: default avatarChristian König <christian.koenig@amd.com>
Tested-by: default avatarChristian König <christian.koenig@amd.com>
Reviewed-by: default avatarChristian König <christian.koenig@amd.com>
Link: https://www.spinics.net/lists/dri-devel/msg321250.html
parent f99413e4
......@@ -190,6 +190,16 @@ long drm_sched_entity_flush(struct drm_sched_entity *entity, long timeout)
}
EXPORT_SYMBOL(drm_sched_entity_flush);
static void drm_sched_entity_kill_jobs_irq_work(struct irq_work *wrk)
{
struct drm_sched_job *job = container_of(wrk, typeof(*job), work);
drm_sched_fence_finished(job->s_fence);
WARN_ON(job->s_fence->parent);
job->sched->ops->free_job(job);
}
/* Signal the scheduler finished fence when the entity in question is killed. */
static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
struct dma_fence_cb *cb)
......@@ -197,9 +207,8 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f,
struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
finish_cb);
drm_sched_fence_finished(job->s_fence);
WARN_ON(job->s_fence->parent);
job->sched->ops->free_job(job);
init_irq_work(&job->work, drm_sched_entity_kill_jobs_irq_work);
irq_work_queue(&job->work);
}
static struct dma_fence *
......
......@@ -28,6 +28,7 @@
#include <linux/dma-fence.h>
#include <linux/completion.h>
#include <linux/xarray.h>
#include <linux/irq_work.h>
#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000)
......@@ -286,7 +287,16 @@ struct drm_sched_job {
struct list_head list;
struct drm_gpu_scheduler *sched;
struct drm_sched_fence *s_fence;
struct dma_fence_cb finish_cb;
/*
* work is used only after finish_cb has been used and will not be
* accessed anymore.
*/
union {
struct dma_fence_cb finish_cb;
struct irq_work work;
};
uint64_t id;
atomic_t karma;
enum drm_sched_priority s_priority;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment