Commit 21182b3c authored by Chris Wilson

drm/i915: Don't claim an unstarted request was guilty

If we haven't even begun executing the payload of the stalled request,
then we should not claim that its userspace context was guilty of
submitting a hanging batch.

v2: Check for context corruption before trying to restart.
v3: Preserve semaphores on skipping requests (need to keep the timelines
intact).
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-7-chris@chris-wilson.co.uk
parent 72eb16df
...@@ -1387,6 +1387,10 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq) ...@@ -1387,6 +1387,10 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
*cs++ = rq->fence.seqno - 1; *cs++ = rq->fence.seqno - 1;
intel_ring_advance(rq, cs); intel_ring_advance(rq, cs);
/* Record the updated position of the request's payload */
rq->infix = intel_ring_offset(rq, cs);
return 0; return 0;
} }
...@@ -1878,6 +1882,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine) ...@@ -1878,6 +1882,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
spin_unlock_irqrestore(&engine->timeline.lock, flags); spin_unlock_irqrestore(&engine->timeline.lock, flags);
} }
/*
 * lrc_regs_ok - quick sanity check of a request's logical ring context image
 *
 * Spot-checks the two ring registers in the saved context image that most
 * commonly betray context corruption after a reset: the ring buffer control
 * (size | valid bit) and the ring buffer start (GGTT offset of the ring vma).
 * Returns true only if both match the values we expect to have programmed.
 */
static bool lrc_regs_ok(const struct i915_request *rq)
{
	const struct intel_ring *ring = rq->ring;
	const u32 *regs = rq->hw_context->lrc_reg_state;

	/* Both registers must hold their expected values, or the image is suspect */
	return regs[CTX_RING_BUFFER_CONTROL + 1] ==
	       (RING_CTL_SIZE(ring->size) | RING_VALID) &&
	       regs[CTX_RING_BUFFER_START + 1] == i915_ggtt_offset(ring->vma);
}
static void execlists_reset(struct intel_engine_cs *engine, bool stalled) static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
{ {
struct intel_engine_execlists * const execlists = &engine->execlists; struct intel_engine_execlists * const execlists = &engine->execlists;
...@@ -1912,6 +1933,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled) ...@@ -1912,6 +1933,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
if (!rq) if (!rq)
goto out_unlock; goto out_unlock;
/*
* If this request hasn't started yet, e.g. it is waiting on a
* semaphore, we need to avoid skipping the request or else we
* break the signaling chain. However, if the context is corrupt
* the request will not restart and we will be stuck with a wedged
* device. It is quite often the case that if we issue a reset
* while the GPU is loading the context image, that the context
* image becomes corrupt.
*
* Otherwise, if we have not started yet, the request should replay
* perfectly and we do not need to flag the result as being erroneous.
*/
if (!i915_request_started(rq) && lrc_regs_ok(rq))
goto out_unlock;
/* /*
* If the request was innocent, we leave the request in the ELSP * If the request was innocent, we leave the request in the ELSP
* and will try to replay it on restarting. The context image may * and will try to replay it on restarting. The context image may
...@@ -1924,7 +1960,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled) ...@@ -1924,7 +1960,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
* image back to the expected values to skip over the guilty request. * image back to the expected values to skip over the guilty request.
*/ */
i915_reset_request(rq, stalled); i915_reset_request(rq, stalled);
if (!stalled) if (!stalled && lrc_regs_ok(rq))
goto out_unlock; goto out_unlock;
/* /*
...@@ -1942,8 +1978,8 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled) ...@@ -1942,8 +1978,8 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
engine->context_size - PAGE_SIZE); engine->context_size - PAGE_SIZE);
} }
/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */ /* Rerun the request; its payload has been neutered (if guilty). */
rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix); rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
intel_ring_update_space(rq->ring); intel_ring_update_space(rq->ring);
execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring); execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
......
...@@ -144,6 +144,13 @@ igt_spinner_create_request(struct igt_spinner *spin, ...@@ -144,6 +144,13 @@ igt_spinner_create_request(struct igt_spinner *spin,
i915_gem_chipset_flush(spin->i915); i915_gem_chipset_flush(spin->i915);
if (engine->emit_init_breadcrumb &&
rq->timeline->has_initial_breadcrumb) {
err = engine->emit_init_breadcrumb(rq);
if (err)
goto cancel_rq;
}
err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0); err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
cancel_rq: cancel_rq:
......
...@@ -242,6 +242,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine) ...@@ -242,6 +242,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
*batch++ = MI_BATCH_BUFFER_END; /* not reached */ *batch++ = MI_BATCH_BUFFER_END; /* not reached */
i915_gem_chipset_flush(h->i915); i915_gem_chipset_flush(h->i915);
if (rq->engine->emit_init_breadcrumb) {
err = rq->engine->emit_init_breadcrumb(rq);
if (err)
goto cancel_rq;
}
flags = 0; flags = 0;
if (INTEL_GEN(vm->i915) <= 5) if (INTEL_GEN(vm->i915) <= 5)
flags |= I915_DISPATCH_SECURE; flags |= I915_DISPATCH_SECURE;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment