Commit 1a370be9 authored by Rob Clark

drm/msm: restart queued submits after hang

Track the list of in-flight submits.  If the gpu hangs, retire up to and
including the offending submit, and then re-submit the remainder.  This
way, for concurrently running piglit tests (for example), one failing
test doesn't cause unrelated tests to fail simply because its submit
was queued up after one that triggered a hang.
Signed-off-by: Rob Clark <robdclark@gmail.com>
parent 56c2da83
...@@ -96,6 +96,7 @@ static inline uint32_t msm_gem_fence(struct msm_gem_object *msm_obj, ...@@ -96,6 +96,7 @@ static inline uint32_t msm_gem_fence(struct msm_gem_object *msm_obj,
struct msm_gem_submit { struct msm_gem_submit {
struct drm_device *dev; struct drm_device *dev;
struct msm_gpu *gpu; struct msm_gpu *gpu;
struct list_head node; /* node in gpu submit_list */
struct list_head bo_list; struct list_head bo_list;
struct ww_acquire_ctx ticket; struct ww_acquire_ctx ticket;
uint32_t fence; uint32_t fence;
......
...@@ -314,7 +314,6 @@ static void submit_cleanup(struct msm_gem_submit *submit, bool fail) ...@@ -314,7 +314,6 @@ static void submit_cleanup(struct msm_gem_submit *submit, bool fail)
} }
ww_acquire_fini(&submit->ticket); ww_acquire_fini(&submit->ticket);
kfree(submit);
} }
int msm_ioctl_gem_submit(struct drm_device *dev, void *data, int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
......
...@@ -265,6 +265,8 @@ static void inactive_start(struct msm_gpu *gpu) ...@@ -265,6 +265,8 @@ static void inactive_start(struct msm_gpu *gpu)
* Hangcheck detection for locked gpu: * Hangcheck detection for locked gpu:
*/ */
static void retire_submits(struct msm_gpu *gpu, uint32_t fence);
static void recover_worker(struct work_struct *work) static void recover_worker(struct work_struct *work)
{ {
struct msm_gpu *gpu = container_of(work, struct msm_gpu, recover_work); struct msm_gpu *gpu = container_of(work, struct msm_gpu, recover_work);
...@@ -274,8 +276,19 @@ static void recover_worker(struct work_struct *work) ...@@ -274,8 +276,19 @@ static void recover_worker(struct work_struct *work)
mutex_lock(&dev->struct_mutex); mutex_lock(&dev->struct_mutex);
if (msm_gpu_active(gpu)) { if (msm_gpu_active(gpu)) {
struct msm_gem_submit *submit;
uint32_t fence = gpu->funcs->last_fence(gpu);
/* retire completed submits, plus the one that hung: */
retire_submits(gpu, fence + 1);
inactive_cancel(gpu); inactive_cancel(gpu);
gpu->funcs->recover(gpu); gpu->funcs->recover(gpu);
/* replay the remaining submits after the one that hung: */
list_for_each_entry(submit, &gpu->submit_list, node) {
gpu->funcs->submit(gpu, submit, NULL);
}
} }
mutex_unlock(&dev->struct_mutex); mutex_unlock(&dev->struct_mutex);
...@@ -418,6 +431,27 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime, ...@@ -418,6 +431,27 @@ int msm_gpu_perfcntr_sample(struct msm_gpu *gpu, uint32_t *activetime,
* Cmdstream submission/retirement: * Cmdstream submission/retirement:
*/ */
/*
 * Drop finished submits from the front of gpu->submit_list.
 *
 * Starting at the head of the list, each submit whose fence is less than
 * or equal to @fence is unlinked and freed.  The walk ends as soon as the
 * head entry carries a larger fence (that entry and everything queued
 * behind it are left in place) or the list runs empty.
 *
 * Emits a WARN_ON if dev->struct_mutex is not locked on entry.
 */
static void retire_submits(struct msm_gpu *gpu, uint32_t fence)
{
	struct drm_device *dev = gpu->dev;
	struct msm_gem_submit *head;

	WARN_ON(!mutex_is_locked(&dev->struct_mutex));

	for (;;) {
		if (list_empty(&gpu->submit_list))
			return;

		head = list_first_entry(&gpu->submit_list,
				struct msm_gem_submit, node);

		/* first submit past the requested fence: nothing more to retire */
		if (head->fence > fence)
			return;

		list_del(&head->node);
		kfree(head);
	}
}
static void retire_worker(struct work_struct *work) static void retire_worker(struct work_struct *work)
{ {
struct msm_gpu *gpu = container_of(work, struct msm_gpu, retire_work); struct msm_gpu *gpu = container_of(work, struct msm_gpu, retire_work);
...@@ -428,6 +462,8 @@ static void retire_worker(struct work_struct *work) ...@@ -428,6 +462,8 @@ static void retire_worker(struct work_struct *work)
mutex_lock(&dev->struct_mutex); mutex_lock(&dev->struct_mutex);
retire_submits(gpu, fence);
while (!list_empty(&gpu->active_list)) { while (!list_empty(&gpu->active_list)) {
struct msm_gem_object *obj; struct msm_gem_object *obj;
...@@ -467,21 +503,22 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, ...@@ -467,21 +503,22 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
struct msm_drm_private *priv = dev->dev_private; struct msm_drm_private *priv = dev->dev_private;
int i, ret; int i, ret;
WARN_ON(!mutex_is_locked(&dev->struct_mutex));
submit->fence = ++priv->next_fence; submit->fence = ++priv->next_fence;
gpu->submitted_fence = submit->fence; gpu->submitted_fence = submit->fence;
inactive_cancel(gpu); inactive_cancel(gpu);
list_add_tail(&submit->node, &gpu->submit_list);
msm_rd_dump_submit(submit); msm_rd_dump_submit(submit);
gpu->submitted_fence = submit->fence; gpu->submitted_fence = submit->fence;
update_sw_cntrs(gpu); update_sw_cntrs(gpu);
ret = gpu->funcs->submit(gpu, submit, ctx);
priv->lastctx = ctx;
for (i = 0; i < submit->nr_bos; i++) { for (i = 0; i < submit->nr_bos; i++) {
struct msm_gem_object *msm_obj = submit->bos[i].obj; struct msm_gem_object *msm_obj = submit->bos[i].obj;
...@@ -505,6 +542,10 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit, ...@@ -505,6 +542,10 @@ int msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE) if (submit->bos[i].flags & MSM_SUBMIT_BO_WRITE)
msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->fence); msm_gem_move_to_active(&msm_obj->base, gpu, true, submit->fence);
} }
ret = gpu->funcs->submit(gpu, submit, ctx);
priv->lastctx = ctx;
hangcheck_timer_reset(gpu); hangcheck_timer_reset(gpu);
return ret; return ret;
...@@ -545,6 +586,8 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, ...@@ -545,6 +586,8 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
INIT_WORK(&gpu->inactive_work, inactive_worker); INIT_WORK(&gpu->inactive_work, inactive_worker);
INIT_WORK(&gpu->recover_work, recover_worker); INIT_WORK(&gpu->recover_work, recover_worker);
INIT_LIST_HEAD(&gpu->submit_list);
setup_timer(&gpu->inactive_timer, inactive_handler, setup_timer(&gpu->inactive_timer, inactive_handler,
(unsigned long)gpu); (unsigned long)gpu);
setup_timer(&gpu->hangcheck_timer, hangcheck_handler, setup_timer(&gpu->hangcheck_timer, hangcheck_handler,
......
...@@ -119,6 +119,8 @@ struct msm_gpu { ...@@ -119,6 +119,8 @@ struct msm_gpu {
struct timer_list hangcheck_timer; struct timer_list hangcheck_timer;
uint32_t hangcheck_fence; uint32_t hangcheck_fence;
struct work_struct recover_work; struct work_struct recover_work;
struct list_head submit_list;
}; };
static inline bool msm_gpu_active(struct msm_gpu *gpu) static inline bool msm_gpu_active(struct msm_gpu *gpu)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment