Commit c41166f9 authored by Chris Wilson

drm/i915: Beware temporary wedging when determining -EIO

At a few points in our uABI, we check to see if the driver is wedged and
report -EIO back to the user in that case. However, as we perform the
check and reset asynchronously (where once before they were both
serialised by the struct_mutex), we may instead see the temporary wedging
used to cancel inflight rendering to avoid a deadlock during reset
(caused either by us timing out in our reset handler, i915_wedge_on_timeout,
or with malice aforethought in intel_reset_prepare for a stuck modeset). If we
suspect this is the case, that is we see a wedged driver *and* a reset in
progress, then wait until the reset is resolved before reporting upon the
wedged status.

v2: might_sleep() (Mika)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109580
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190220145637.23503-1-chris@chris-wilson.co.uk
parent 47ed55a9
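Before the diff, a rough illustration of what the change means for callers:
i915_terminally_wedged() is no longer a simple boolean test but an error code
that distinguishes a terminal wedge (-EIO) from a transient one that may still
be recovered by the reset in flight. The sketch below is a minimal,
self-contained user-space model of that decision flow, not the driver code;
the names gpu_error_model, terminally_wedged_model and wait_for_reset are
invented for illustration, and the real implementation added below must also
sleep interruptibly and can return -EAGAIN or -EINTR.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for the driver state the patch consults. */
struct gpu_error_model {
        bool wedged;        /* I915_WEDGED: inflight rendering was cancelled */
        bool reset_backoff; /* I915_RESET_BACKOFF: a reset is in progress */
};

/*
 * Stand-in for sleeping on error->reset_queue until the reset completes.
 * Here we simply assume the reset finishes and succeeds.
 */
static void wait_for_reset(struct gpu_error_model *e)
{
        e->reset_backoff = false;
        e->wedged = false;
}

/* Returns 0 if the device is (or becomes) usable, -EIO if terminally wedged. */
static int terminally_wedged_model(struct gpu_error_model *e)
{
        if (!e->wedged)
                return 0;

        /* Wedged with no reset pending: nobody is coming to fix it. */
        if (!e->reset_backoff)
                return -EIO;

        /*
         * Wedged *and* a reset in progress: this may be the temporary
         * wedging used to cancel inflight rendering, so wait for the
         * reset to resolve before passing judgement.
         */
        wait_for_reset(e);

        return e->wedged ? -EIO : 0;
}

int main(void)
{
        struct gpu_error_model e = { .wedged = true, .reset_backoff = true };

        /* Prints 0: the temporary wedge was resolved by the reset. */
        printf("verdict: %d\n", terminally_wedged_model(&e));
        return 0;
}

Waiting for the reset to resolve, rather than reporting the temporary wedge
immediately, is what avoids spurious -EIO during the reset window. The patch
itself follows.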
@@ -3833,11 +3833,18 @@ static const struct file_operations i915_cur_wm_latency_fops = {
static int
i915_wedged_get(void *data, u64 *val)
{
struct drm_i915_private *dev_priv = data;
*val = i915_terminally_wedged(&dev_priv->gpu_error);
int ret = i915_terminally_wedged(data);
switch (ret) {
case -EIO:
*val = 1;
return 0;
case 0:
*val = 0;
return 0;
default:
return ret;
}
}
static int
@@ -3918,7 +3925,7 @@ i915_drop_caches_set(void *data, u64 val)
mutex_unlock(&i915->drm.struct_mutex);
}
if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(i915))
i915_handle_error(i915, ALL_ENGINES, 0, NULL);
fs_reclaim_acquire(GFP_KERNEL);
@@ -3020,11 +3020,16 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
struct i915_request *
i915_gem_find_active_request(struct intel_engine_cs *engine);
static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
static inline bool __i915_wedged(struct i915_gpu_error *error)
{
return unlikely(test_bit(I915_WEDGED, &error->flags));
}
static inline bool i915_reset_failed(struct drm_i915_private *i915)
{
return __i915_wedged(&i915->gpu_error);
}
static inline u32 i915_reset_count(struct i915_gpu_error *error)
{
return READ_ONCE(error->reset_count);
@@ -1928,7 +1928,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
* fail). But any other -EIO isn't ours (e.g. swap in failure)
* and so needs to be reported.
*/
if (!i915_terminally_wedged(&dev_priv->gpu_error))
if (!i915_terminally_wedged(dev_priv))
return VM_FAULT_SIGBUS;
/* else: fall through */
case -EAGAIN:
@@ -2958,7 +2958,7 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
struct intel_engine_cs *engine;
enum intel_engine_id id;
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_reset_failed(i915))
return;
GEM_BUG_ON(i915->gt.active_requests);
@@ -3806,8 +3806,9 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
long ret;
/* ABI: return -EIO if already wedged */
if (i915_terminally_wedged(&dev_priv->gpu_error))
return -EIO;
ret = i915_terminally_wedged(dev_priv);
if (ret)
return ret;
spin_lock(&file_priv->mm.lock);
list_for_each_entry(request, &file_priv->mm.request_list, client_link) {
@@ -4460,7 +4461,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
* back to defaults, recovering from whatever wedged state we left it
* in and so worth trying to use the device once more.
*/
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_terminally_wedged(i915))
i915_gem_unset_wedged(i915);
/*
@@ -4504,7 +4505,7 @@ int i915_gem_suspend(struct drm_i915_private *i915)
* state. Fortunately, the kernel_context is disposable and we do
* not rely on its state.
*/
if (!i915_terminally_wedged(&i915->gpu_error)) {
if (!i915_reset_failed(i915)) {
ret = i915_gem_switch_to_kernel_context(i915);
if (ret)
goto err_unlock;
@@ -4625,8 +4626,9 @@ void i915_gem_resume(struct drm_i915_private *i915)
return;
err_wedged:
if (!i915_terminally_wedged(&i915->gpu_error)) {
DRM_ERROR("failed to re-initialize GPU, declaring wedged!\n");
if (!i915_reset_failed(i915)) {
dev_err(i915->drm.dev,
"Failed to re-initialize GPU, declaring it wedged!\n");
i915_gem_set_wedged(i915);
}
goto out_unlock;
@@ -4731,10 +4733,9 @@ int i915_gem_init_hw(struct drm_i915_private *dev_priv)
init_unused_rings(dev_priv);
BUG_ON(!dev_priv->kernel_context);
if (i915_terminally_wedged(&dev_priv->gpu_error)) {
ret = -EIO;
ret = i915_terminally_wedged(dev_priv);
if (ret)
goto out;
}
ret = i915_ppgtt_init_hw(dev_priv);
if (ret) {
@@ -5107,7 +5108,7 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
* wedged. But we only want to do this where the GPU is angry,
* for all other failure, such as an allocation failure, bail.
*/
if (!i915_terminally_wedged(&dev_priv->gpu_error)) {
if (!i915_reset_failed(dev_priv)) {
i915_load_error(dev_priv,
"Failed to initialize GPU, declaring it wedged!\n");
i915_gem_set_wedged(dev_priv);
@@ -559,8 +559,9 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
* ABI: Before userspace accesses the GPU (e.g. execbuffer), report
* EIO if the GPU is already wedged.
*/
if (i915_terminally_wedged(&i915->gpu_error))
return ERR_PTR(-EIO);
ret = i915_terminally_wedged(i915);
if (ret)
return ERR_PTR(ret);
/*
* Pinning the contexts may generate requests in order to acquire
@@ -1032,7 +1032,7 @@ void i915_reset(struct drm_i915_private *i915,
finish:
reset_finish(i915);
if (!i915_terminally_wedged(error))
if (!__i915_wedged(error))
reset_restart(i915);
return;
@@ -1253,7 +1253,7 @@ void i915_handle_error(struct drm_i915_private *i915,
* Try engine reset when available. We fall back to full reset if
* single reset fails.
*/
if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
if (intel_has_reset_engine(i915) && !__i915_wedged(error)) {
for_each_engine_masked(engine, i915, engine_mask, tmp) {
BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1339,6 +1339,31 @@ __releases(&i915->gpu_error.reset_backoff_srcu)
srcu_read_unlock(&error->reset_backoff_srcu, tag);
}
int i915_terminally_wedged(struct drm_i915_private *i915)
{
struct i915_gpu_error *error = &i915->gpu_error;
might_sleep();
if (!__i915_wedged(error))
return 0;
/* Reset still in progress? Maybe we will recover? */
if (!test_bit(I915_RESET_BACKOFF, &error->flags))
return -EIO;
/* XXX intel_reset_finish() still takes struct_mutex!!! */
if (mutex_is_locked(&i915->drm.struct_mutex))
return -EAGAIN;
if (wait_event_interruptible(error->reset_queue,
!test_bit(I915_RESET_BACKOFF,
&error->flags)))
return -EINTR;
return __i915_wedged(error) ? -EIO : 0;
}
bool i915_reset_flush(struct drm_i915_private *i915)
{
int err;
@@ -36,6 +36,8 @@ bool i915_reset_flush(struct drm_i915_private *i915);
int __must_check i915_reset_trylock(struct drm_i915_private *i915);
void i915_reset_unlock(struct drm_i915_private *i915, int tag);
int i915_terminally_wedged(struct drm_i915_private *i915);
bool intel_has_gpu_reset(struct drm_i915_private *i915);
bool intel_has_reset_engine(struct drm_i915_private *i915);
@@ -1007,10 +1007,8 @@ static bool ring_is_idle(struct intel_engine_cs *engine)
*/
bool intel_engine_is_idle(struct intel_engine_cs *engine)
{
struct drm_i915_private *dev_priv = engine->i915;
/* More white lies, if wedged, hw state is inconsistent */
if (i915_terminally_wedged(&dev_priv->gpu_error))
if (i915_reset_failed(engine->i915))
return true;
/* Any inflight/incomplete requests? */
@@ -1054,7 +1052,7 @@ bool intel_engines_are_idle(struct drm_i915_private *dev_priv)
* If the driver is wedged, HW state may be very inconsistent and
* report that it is still busy, even though we have stopped using it.
*/
if (i915_terminally_wedged(&dev_priv->gpu_error))
if (i915_reset_failed(dev_priv))
return true;
for_each_engine(engine, dev_priv, id) {
@@ -1496,7 +1494,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
va_end(ap);
}
if (i915_terminally_wedged(&engine->i915->gpu_error))
if (i915_reset_failed(engine->i915))
drm_printf(m, "*** WEDGED ***\n");
drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
@@ -263,7 +263,7 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
if (!READ_ONCE(dev_priv->gt.awake))
return;
if (i915_terminally_wedged(&dev_priv->gpu_error))
if (i915_terminally_wedged(dev_priv))
return;
/* As enabling the GPU requires fairly extensive mmio access,
@@ -1764,7 +1764,7 @@ int i915_gem_huge_page_live_selftests(struct drm_i915_private *dev_priv)
return 0;
}
if (i915_terminally_wedged(&dev_priv->gpu_error))
if (i915_terminally_wedged(dev_priv))
return 0;
file = mock_file(dev_priv);
@@ -150,7 +150,7 @@ int i915_active_live_selftests(struct drm_i915_private *i915)
SUBTEST(live_active_retire),
};
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_terminally_wedged(i915))
return 0;
return i915_subtests(tests, i915);
@@ -248,12 +248,12 @@ static bool always_valid(struct drm_i915_private *i915)
static bool needs_fence_registers(struct drm_i915_private *i915)
{
return !i915_terminally_wedged(&i915->gpu_error);
return !i915_terminally_wedged(i915);
}
static bool needs_mi_store_dword(struct drm_i915_private *i915)
{
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_terminally_wedged(i915))
return false;
return intel_engine_can_store_dword(i915->engine[RCS]);
@@ -1632,7 +1632,7 @@ int i915_gem_context_live_selftests(struct drm_i915_private *dev_priv)
SUBTEST(igt_vm_isolation),
};
if (i915_terminally_wedged(&dev_priv->gpu_error))
if (i915_terminally_wedged(dev_priv))
return 0;
return i915_subtests(tests, dev_priv);
@@ -547,7 +547,7 @@ int i915_gem_evict_live_selftests(struct drm_i915_private *i915)
SUBTEST(igt_evict_contexts),
};
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_terminally_wedged(i915))
return 0;
return i915_subtests(tests, i915);
@@ -583,7 +583,7 @@ static int igt_mmap_offset_exhaustion(void *arg)
for (loop = 0; loop < 3; loop++) {
intel_wakeref_t wakeref;
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_terminally_wedged(i915))
break;
obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
@@ -1246,7 +1246,7 @@ int i915_request_live_selftests(struct drm_i915_private *i915)
SUBTEST(live_breadcrumbs_smoketest),
};
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_terminally_wedged(i915))
return 0;
return i915_subtests(tests, i915);
@@ -29,5 +29,5 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
i915_gem_set_wedged(i915);
}
return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
return i915_terminally_wedged(i915);
}
@@ -342,7 +342,7 @@ static int igt_hang_sanitycheck(void *arg)
timeout = i915_request_wait(rq,
I915_WAIT_LOCKED,
MAX_SCHEDULE_TIMEOUT);
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_reset_failed(i915))
timeout = -EIO;
i915_request_put(rq);
@@ -383,7 +383,7 @@ static int igt_global_reset(void *arg)
igt_global_reset_unlock(i915);
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_reset_failed(i915))
err = -EIO;
return err;
@@ -401,13 +401,13 @@ static int igt_wedged_reset(void *arg)
i915_gem_set_wedged(i915);
GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
GEM_BUG_ON(!i915_reset_failed(i915));
i915_reset(i915, ALL_ENGINES, NULL);
intel_runtime_pm_put(i915, wakeref);
igt_global_reset_unlock(i915);
return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
return i915_reset_failed(i915) ? -EIO : 0;
}
static bool wait_for_idle(struct intel_engine_cs *engine)
@@ -529,7 +529,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
break;
}
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_reset_failed(i915))
err = -EIO;
if (active) {
@@ -843,7 +843,7 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
break;
}
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_reset_failed(i915))
err = -EIO;
if (flags & TEST_ACTIVE) {
@@ -969,7 +969,7 @@ static int igt_reset_wait(void *arg)
mutex_unlock(&i915->drm.struct_mutex);
igt_global_reset_unlock(i915);
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_reset_failed(i915))
return -EIO;
return err;
@@ -1166,7 +1166,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
unlock:
mutex_unlock(&i915->drm.struct_mutex);
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_reset_failed(i915))
return -EIO;
return err;
@@ -1372,7 +1372,7 @@ static int igt_reset_queue(void *arg)
mutex_unlock(&i915->drm.struct_mutex);
igt_global_reset_unlock(i915);
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_reset_failed(i915))
return -EIO;
return err;
@@ -1552,7 +1552,7 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
i915_request_wait(rq,
I915_WAIT_LOCKED,
MAX_SCHEDULE_TIMEOUT);
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_reset_failed(i915))
err = -EIO;
}
@@ -1591,7 +1591,7 @@ static int igt_atomic_reset(void *arg)
/* Flush any requests before we get started and check basics */
force_reset(i915);
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_reset_failed(i915))
goto unlock;
if (intel_has_gpu_reset(i915)) {
@@ -1665,7 +1665,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
if (!intel_has_gpu_reset(i915))
return 0;
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_terminally_wedged(i915))
return -EIO; /* we're long past hope of a successful reset */
wakeref = intel_runtime_pm_get(i915);
@@ -895,7 +895,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
if (!HAS_EXECLISTS(i915))
return 0;
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_terminally_wedged(i915))
return 0;
return i915_subtests(tests, i915);
@@ -181,7 +181,7 @@ static int check_whitelist(struct i915_gem_context *ctx,
err = 0;
igt_wedge_on_timeout(&wedge, ctx->i915, HZ / 5) /* a safety net! */
err = i915_gem_object_set_to_cpu_domain(results, false);
if (i915_terminally_wedged(&ctx->i915->gpu_error))
if (i915_terminally_wedged(ctx->i915))
err = -EIO;
if (err)
goto out_put;
@@ -510,7 +510,7 @@ int intel_workarounds_live_selftests(struct drm_i915_private *i915)
};
int err;
if (i915_terminally_wedged(&i915->gpu_error))
if (i915_terminally_wedged(i915))
return 0;
mutex_lock(&i915->drm.struct_mutex);