Commit 0a7d355e authored by Chris Wilson

drm/i915/gt: Allow failed resets without assertion

If the engine reset fails, we will attempt to resume with the current
inflight submissions. When that happens, we cannot assert that the
engine reset cleared the pending submission, so do not.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2878
Fixes: 16f2941a ("drm/i915/gt: Replace direct submit with direct call to tasklet")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Andi Shyti <andi.shyti@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210104115145.24460-3-chris@chris-wilson.co.uk
parent c864e9ab
...@@ -561,6 +561,8 @@ struct intel_engine_cs { ...@@ -561,6 +561,8 @@ struct intel_engine_cs {
unsigned long stop_timeout_ms; unsigned long stop_timeout_ms;
unsigned long timeslice_duration_ms; unsigned long timeslice_duration_ms;
} props, defaults; } props, defaults;
I915_SELFTEST_DECLARE(struct fault_attr reset_timeout);
}; };
static inline bool static inline bool
......
...@@ -3047,9 +3047,13 @@ static void execlists_reset_finish(struct intel_engine_cs *engine) ...@@ -3047,9 +3047,13 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
* After a GPU reset, we may have requests to replay. Do so now while * After a GPU reset, we may have requests to replay. Do so now while
* we still have the forcewake to be sure that the GPU is not allowed * we still have the forcewake to be sure that the GPU is not allowed
* to sleep before we restart and reload a context. * to sleep before we restart and reload a context.
*
* If the GPU reset fails, the engine may still be alive with requests
* inflight. We expect those to complete, or for the device to be
* reset as the next level of recovery, and as a final resort we
* will declare the device wedged.
*/ */
GEM_BUG_ON(!reset_in_progress(execlists)); GEM_BUG_ON(!reset_in_progress(execlists));
GEM_BUG_ON(engine->execlists.pending[0]);
/* And kick in case we missed a new request submission. */ /* And kick in case we missed a new request submission. */
if (__tasklet_enable(&execlists->tasklet)) if (__tasklet_enable(&execlists->tasklet))
......
...@@ -497,6 +497,9 @@ static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) ...@@ -497,6 +497,9 @@ static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
u32 request, mask, ack; u32 request, mask, ack;
int ret; int ret;
if (I915_SELFTEST_ONLY(should_fail(&engine->reset_timeout, 1)))
return -ETIMEDOUT;
ack = intel_uncore_read_fw(uncore, reg); ack = intel_uncore_read_fw(uncore, reg);
if (ack & RESET_CTL_CAT_ERROR) { if (ack & RESET_CTL_CAT_ERROR) {
/* /*
......
...@@ -2299,6 +2299,77 @@ static int __cancel_hostile(struct live_preempt_cancel *arg) ...@@ -2299,6 +2299,77 @@ static int __cancel_hostile(struct live_preempt_cancel *arg)
return err; return err;
} }
static void force_reset_timeout(struct intel_engine_cs *engine)
{
engine->reset_timeout.probability = 999;
atomic_set(&engine->reset_timeout.times, -1);
}
static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}
/*
 * __cancel_fail - exercise recovery when an engine reset itself fails.
 *
 * Submit a non-preemptible spinner, ban its context and trigger a pulse so
 * that the preempt timeout forces an engine reset; fault injection makes
 * that reset report -ETIMEDOUT. We then rely on the heartbeat to escalate
 * to a full device reset and verify the banned request is still cancelled.
 *
 * Returns 0 on success, a negative errno on failure.
 */
static int __cancel_fail(struct live_preempt_cancel *arg)
{
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq;
	int err;

	/* Needs the preempt-timeout mechanism and per-engine reset support */
	if (!IS_ACTIVE(CONFIG_DRM_I915_PREEMPT_TIMEOUT))
		return 0;

	if (!intel_has_reset_engine(engine->gt))
		return 0;

	GEM_TRACE("%s(%s)\n", __func__, engine->name);
	rq = spinner_create_request(&arg->a.spin,
				    arg->a.ctx, engine,
				    MI_NOOP); /* preemption disabled */
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	clear_bit(CONTEXT_BANNED, &rq->context->flags);
	i915_request_get(rq);
	i915_request_add(rq);
	if (!igt_wait_for_spinner(&arg->a.spin, rq)) {
		err = -EIO;
		goto out;
	}

	/* Ban the context and prod the engine so preemption is attempted */
	intel_context_set_banned(rq->context);
	err = intel_engine_pulse(engine);
	if (err)
		goto out;

	force_reset_timeout(engine);

	/*
	 * Wait for the preemption to be attempted, then expire the preempt
	 * timer by hand so the reset runs now, while fault injection is
	 * armed to make it fail.
	 */
	/* force preempt reset [failure] */
	while (!engine->execlists.pending[0])
		intel_engine_flush_submission(engine);
	del_timer_sync(&engine->execlists.preempt);
	intel_engine_flush_submission(engine);

	cancel_reset_timeout(engine);

	/* after failure, require heartbeats to reset device */
	intel_engine_set_heartbeat(engine, 1);
	err = wait_for_reset(engine, rq, HZ / 2);
	intel_engine_set_heartbeat(engine,
				   engine->defaults.heartbeat_interval_ms);
	if (err)
		pr_err("Cancelled inflight0 request did not reset\n");

out:
	i915_request_put(rq);
	if (igt_flush_test(engine->i915))
		err = -EIO;
	return err;
}
static int live_preempt_cancel(void *arg) static int live_preempt_cancel(void *arg)
{ {
struct intel_gt *gt = arg; struct intel_gt *gt = arg;
...@@ -2338,6 +2409,10 @@ static int live_preempt_cancel(void *arg) ...@@ -2338,6 +2409,10 @@ static int live_preempt_cancel(void *arg)
err = __cancel_hostile(&data); err = __cancel_hostile(&data);
if (err) if (err)
goto err_wedged; goto err_wedged;
err = __cancel_fail(&data);
if (err)
goto err_wedged;
} }
err = 0; err = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment