Commit a78422e9 authored by Danilo Krummrich

drm/sched: implement dynamic job-flow control

Currently, job flow control is implemented simply by limiting the number
of jobs in flight. Therefore, a scheduler is initialized with a credit
limit that corresponds to the number of jobs which can be sent to the
hardware.

This implies that for each job, drivers need to account for the maximum
job size possible in order to not overflow the ring buffer.

However, there are drivers, such as Nouveau, where the job size has a
rather large range. For such drivers it can easily happen that job
submissions not even filling the ring by 1% can block subsequent
submissions, which, in the worst case, can lead to the ring running dry.

In order to overcome this issue, allow for tracking the actual job size
instead of the number of jobs. Therefore, add a field to track a job's
credit count, which represents the number of credits a job contributes
to the scheduler's credit limit.
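
As a rough illustration of the new scheme (not part of this patch; all
foo_* names and sizes below are made up), a driver with a wide range of
job sizes would size each job's credits by the ring space it consumes:

#include <drm/gpu_scheduler.h>

/* Hypothetical driver-side sketch; all foo_* identifiers are illustrative. */
struct foo_job {
	struct drm_sched_job base;
	u32 num_cmds;			/* commands this job will emit */
};

#define FOO_DWORDS_PER_CMD 4		/* assumed ring footprint per command */

static int foo_submit_job(struct drm_sched_entity *entity,
			  struct foo_job *job, void *owner)
{
	/* One credit per ring dword the job will consume, instead of the
	 * old flat "one job, one ring slot" accounting.
	 */
	u32 credits = job->num_cmds * FOO_DWORDS_PER_CMD;

	return drm_sched_job_init(&job->base, entity, credits, owner);
}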
Signed-off-by: Danilo Krummrich <dakr@redhat.com>
Reviewed-by: Luben Tuikov <ltuikov89@gmail.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20231110001638.71750-1-dakr@redhat.com
parent 36245bd0
@@ -552,6 +552,12 @@ Overview
 .. kernel-doc:: drivers/gpu/drm/scheduler/sched_main.c
    :doc: Overview
 
+Flow Control
+------------
+
+.. kernel-doc:: drivers/gpu/drm/scheduler/sched_main.c
+   :doc: Flow Control
+
 Scheduler Function References
 -----------------------------
......
@@ -115,7 +115,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	if (!entity)
 		return 0;
 
-	return drm_sched_job_init(&(*job)->base, entity, owner);
+	return drm_sched_job_init(&(*job)->base, entity, 1, owner);
 }
 
 int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev,
......
@@ -535,7 +535,7 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
 	ret = drm_sched_job_init(&submit->sched_job,
 				 &ctx->sched_entity[args->pipe],
-				 submit->ctx);
+				 1, submit->ctx);
 	if (ret)
 		goto err_submit_put;
......
@@ -1917,7 +1917,7 @@ static int etnaviv_gpu_rpm_suspend(struct device *dev)
 	u32 idle, mask;
 
 	/* If there are any jobs in the HW queue, we're not idle */
-	if (atomic_read(&gpu->sched.hw_rq_count))
+	if (atomic_read(&gpu->sched.credit_count))
 		return -EBUSY;
 
 	/* Check whether the hardware (except FE and MC) is idle */
......
@@ -514,7 +514,7 @@ int lima_device_suspend(struct device *dev)
 	/* check any task running */
 	for (i = 0; i < lima_pipe_num; i++) {
-		if (atomic_read(&ldev->pipe[i].base.hw_rq_count))
+		if (atomic_read(&ldev->pipe[i].base.credit_count))
 			return -EBUSY;
 	}
......
@@ -123,7 +123,7 @@ int lima_sched_task_init(struct lima_sched_task *task,
 	for (i = 0; i < num_bos; i++)
 		drm_gem_object_get(&bos[i]->base.base);
 
-	err = drm_sched_job_init(&task->base, &context->base, vm);
+	err = drm_sched_job_init(&task->base, &context->base, 1, vm);
 	if (err) {
 		kfree(task->bos);
 		return err;
......
@@ -48,7 +48,7 @@ static struct msm_gem_submit *submit_create(struct drm_device *dev,
 		return ERR_PTR(ret);
 	}
 
-	ret = drm_sched_job_init(&submit->base, queue->entity, queue);
+	ret = drm_sched_job_init(&submit->base, queue->entity, 1, queue);
 	if (ret) {
 		kfree(submit->hw_fence);
 		kfree(submit);
......
@@ -89,7 +89,7 @@ nouveau_job_init(struct nouveau_job *job,
 	}
 
-	ret = drm_sched_job_init(&job->base, &entity->base, NULL);
+	ret = drm_sched_job_init(&job->base, &entity->base, 1, NULL);
 	if (ret)
 		goto err_free_chains;
......
@@ -274,7 +274,7 @@ static int panfrost_ioctl_submit(struct drm_device *dev, void *data,
 	ret = drm_sched_job_init(&job->base,
 				 &file_priv->sched_entity[slot],
-				 NULL);
+				 1, NULL);
 	if (ret)
 		goto out_put_job;
......
@@ -963,7 +963,7 @@ int panfrost_job_is_idle(struct panfrost_device *pfdev)
 	for (i = 0; i < NUM_JOB_SLOTS; i++) {
 		/* If there are any jobs in the HW queue, we're not idle */
-		if (atomic_read(&js->queue[i].sched.hw_rq_count))
+		if (atomic_read(&js->queue[i].sched.credit_count))
 			return false;
 	}
......
@@ -51,7 +51,7 @@ DECLARE_EVENT_CLASS(drm_sched_job,
 			   __assign_str(name, sched_job->sched->name);
 			   __entry->job_count = spsc_queue_count(&entity->job_queue);
 			   __entry->hw_job_count = atomic_read(
-					   &sched_job->sched->hw_rq_count);
+					   &sched_job->sched->credit_count);
 			   ),
 	    TP_printk("entity=%p, id=%llu, fence=%p, ring=%s, job count:%u, hw job count:%d",
 		      __entry->entity, __entry->id,
......
This diff is collapsed.
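
The collapsed diff above carries the scheduler-core accounting. In rough
terms (a hedged sketch under the assumed foo_* names, not the verbatim
patch), the core may only pick a job once its credits fit into the
remaining budget, optionally refreshing the count via the new callback:

static u32 foo_available_credits(struct drm_gpu_scheduler *sched)
{
	u32 used = atomic_read(&sched->credit_count);

	/* Clamp in case a racing update briefly overshoots the limit. */
	return sched->credit_limit - min(used, sched->credit_limit);
}

static bool foo_can_queue(struct drm_gpu_scheduler *sched,
			  struct drm_sched_job *job)
{
	if (sched->ops->update_job_credits)
		job->credits = sched->ops->update_job_credits(job);

	return job->credits <= foo_available_credits(sched);
}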
@@ -418,7 +418,7 @@ v3d_job_init(struct v3d_dev *v3d, struct drm_file *file_priv,
 	job->file = file_priv;
 
 	ret = drm_sched_job_init(&job->base, &v3d_priv->sched_entity[queue],
-				 v3d_priv);
+				 1, v3d_priv);
 	if (ret)
 		goto fail;
......
@@ -321,6 +321,7 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
  * @sched: the scheduler instance on which this job is scheduled.
  * @s_fence: contains the fences for the scheduling of job.
  * @finish_cb: the callback for the finished fence.
+ * @credits: the number of credits this job contributes to the scheduler
  * @work: Helper to reschedule job kill to different context.
  * @id: a unique id assigned to each job scheduled on the scheduler.
  * @karma: increment on every hang caused by this job. If this exceeds the hang
@@ -340,6 +341,8 @@ struct drm_sched_job {
 	struct drm_gpu_scheduler	*sched;
 	struct drm_sched_fence		*s_fence;
 
+	u32				credits;
+
 	/*
 	 * work is used only after finish_cb has been used and will not be
 	 * accessed anymore.
@@ -463,13 +466,27 @@ struct drm_sched_backend_ops {
 	 * and it's time to clean it up.
 	 */
 	void (*free_job)(struct drm_sched_job *sched_job);
+
+	/**
+	 * @update_job_credits: Called when the scheduler is considering this
+	 * job for execution.
+	 *
+	 * This callback returns the number of credits the job would take if
+	 * pushed to the hardware. Drivers may use this to dynamically update
+	 * the job's credit count. For instance, deduct the number of credits
+	 * for already signalled native fences.
+	 *
+	 * This callback is optional.
+	 */
+	u32 (*update_job_credits)(struct drm_sched_job *sched_job);
 };
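
A minimal sketch of implementing this optional callback, assuming the
foo_job from the earlier sketch additionally carries a native_fence
member (an assumption for illustration, not code from this patch):

static u32 foo_update_job_credits(struct drm_sched_job *sched_job)
{
	struct foo_job *job = container_of(sched_job, struct foo_job, base);
	u32 credits = sched_job->credits;

	/* A native fence that signalled since submission no longer needs
	 * its wait command, so drop the credit that command would cost.
	 */
	if (credits > 1 && dma_fence_is_signaled(job->native_fence))
		credits--;

	return credits;
}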
 
 /**
  * struct drm_gpu_scheduler - scheduler instance-specific data
  *
  * @ops: backend operations provided by the driver.
- * @hw_submission_limit: the max size of the hardware queue.
+ * @credit_limit: the credit limit of this scheduler
+ * @credit_count: the current credit count of this scheduler
  * @timeout: the time after which a job is removed from the scheduler.
  * @name: name of the ring for which this scheduler is being used.
  * @num_rqs: Number of run-queues. This is at most DRM_SCHED_PRIORITY_COUNT,
@@ -478,7 +495,6 @@ struct drm_sched_backend_ops {
  * @job_scheduled: once @drm_sched_entity_do_release is called the scheduler
  *                 waits on this wait queue until all the scheduled jobs are
  *                 finished.
- * @hw_rq_count: the number of jobs currently in the hardware queue.
  * @job_id_count: used to assign a unique id to each job.
  * @submit_wq: workqueue used to queue @work_run_job and @work_free_job
  * @timeout_wq: workqueue used to queue @work_tdr
@@ -502,13 +518,13 @@ struct drm_sched_backend_ops {
  */
 struct drm_gpu_scheduler {
 	const struct drm_sched_backend_ops	*ops;
-	uint32_t			hw_submission_limit;
+	u32				credit_limit;
+	atomic_t			credit_count;
 	long				timeout;
 	const char			*name;
 	u32				num_rqs;
 	struct drm_sched_rq		**sched_rq;
 	wait_queue_head_t		job_scheduled;
-	atomic_t			hw_rq_count;
 	atomic64_t			job_id_count;
 	struct workqueue_struct		*submit_wq;
 	struct workqueue_struct		*timeout_wq;
@@ -530,14 +546,14 @@ struct drm_gpu_scheduler {
 int drm_sched_init(struct drm_gpu_scheduler *sched,
 		   const struct drm_sched_backend_ops *ops,
 		   struct workqueue_struct *submit_wq,
-		   u32 num_rqs, uint32_t hw_submission, unsigned int hang_limit,
+		   u32 num_rqs, u32 credit_limit, unsigned int hang_limit,
 		   long timeout, struct workqueue_struct *timeout_wq,
 		   atomic_t *score, const char *name, struct device *dev);
 
 void drm_sched_fini(struct drm_gpu_scheduler *sched);
 int drm_sched_job_init(struct drm_sched_job *job,
 		       struct drm_sched_entity *entity,
-		       void *owner);
+		       u32 credits, void *owner);
 void drm_sched_job_arm(struct drm_sched_job *job);
 int drm_sched_job_add_dependency(struct drm_sched_job *job,
 				 struct dma_fence *fence);
......
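
Correspondingly, a driver initializes the scheduler with a credit limit
in the same units as its per-job credits. A hedged example call, with
foo_sched_ops, FOO_RING_SIZE_DWORDS, and the foo_device layout all
assumed for illustration:

static int foo_sched_init(struct foo_device *fdev)
{
	/* credit_limit is the ring size in dwords here, rather than a
	 * maximum number of in-flight jobs.
	 */
	return drm_sched_init(&fdev->sched, &foo_sched_ops,
			      NULL,			/* default submit_wq */
			      DRM_SCHED_PRIORITY_COUNT,	/* num_rqs */
			      FOO_RING_SIZE_DWORDS,	/* credit_limit */
			      0,			/* hang_limit */
			      msecs_to_jiffies(500),	/* timeout */
			      NULL,			/* timeout_wq */
			      NULL,			/* score */
			      "foo_ring", fdev->dev);
}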