Commit 789659f4 authored by Chris Wilson

drm/i915: Drop fake breadcrumb irq

Missed breadcrumb detection is defunct due to the tight coupling with
dma_fence signaling and the myriad ways we may signal fences from
everywhere but from an interrupt, i.e. we frequently signal a fence
before we even see its interrupt. This means that even if we miss an
interrupt for a fence, it still is signaled before our breadcrumb
hangcheck fires, so simplify the breadcrumb hangchecking by moving it
into the GPU hangcheck and forgo fake interrupts.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190129205230.19056-3-chris@chris-wilson.co.uk
parent 52c0fdb2
...@@ -1321,9 +1321,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused) ...@@ -1321,9 +1321,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
intel_engine_last_submit(engine), intel_engine_last_submit(engine),
jiffies_to_msecs(jiffies - jiffies_to_msecs(jiffies -
engine->hangcheck.action_timestamp)); engine->hangcheck.action_timestamp));
seq_printf(m, "\tfake irq active? %s\n",
yesno(test_bit(engine->id,
&dev_priv->gpu_error.missed_irq_rings)));
seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n", seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
(long long)engine->hangcheck.acthd, (long long)engine->hangcheck.acthd,
...@@ -3899,94 +3896,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops, ...@@ -3899,94 +3896,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
i915_wedged_get, i915_wedged_set, i915_wedged_get, i915_wedged_set,
"%llu\n"); "%llu\n");
/*
 * fault_irq_set - store @val into *@irq after waiting for the GPU to idle.
 * @i915: device whose struct_mutex and idle worker are used
 * @irq: location to overwrite
 * @val: new value for *@irq
 *
 * Takes struct_mutex interruptibly, waits interruptibly for
 * i915_gem_wait_for_idle() (timeout MAX_SCHEDULE_TIMEOUT), writes the value,
 * drops the mutex and then drains the delayed idle worker.
 *
 * Returns 0 on success, or the error from mutex_lock_interruptible() or
 * i915_gem_wait_for_idle(); on error *@irq is left unmodified.
 */
static int
fault_irq_set(struct drm_i915_private *i915,
unsigned long *irq,
unsigned long val)
{
int err;
err = mutex_lock_interruptible(&i915->drm.struct_mutex);
if (err)
return err;
err = i915_gem_wait_for_idle(i915,
I915_WAIT_LOCKED |
I915_WAIT_INTERRUPTIBLE,
MAX_SCHEDULE_TIMEOUT);
if (err)
goto err_unlock;
/* Only updated while idle and with struct_mutex held. */
*irq = val;
mutex_unlock(&i915->drm.struct_mutex);
/* Flush idle worker to disarm irq */
drain_delayed_work(&i915->gt.idle_work);
return 0;
err_unlock:
mutex_unlock(&i915->drm.struct_mutex);
return err;
}
/*
 * Getter for the "i915_ring_missed_irq" debugfs file: copies the current
 * gpu_error.missed_irq_rings bitmask into *@val. Always returns 0.
 */
static int
i915_ring_missed_irq_get(void *data, u64 *val)
{
struct drm_i915_private *dev_priv = data;
*val = dev_priv->gpu_error.missed_irq_rings;
return 0;
}
/*
 * Setter for the "i915_ring_missed_irq" debugfs file: replaces
 * gpu_error.missed_irq_rings with @val through fault_irq_set() and returns
 * that helper's result.
 */
static int
i915_ring_missed_irq_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val);
}
/* Pairs the missed-irq getter and setter; the value uses the "0x%08llx\n" format. */
DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
i915_ring_missed_irq_get, i915_ring_missed_irq_set,
"0x%08llx\n");
/*
 * Getter for the "i915_ring_test_irq" debugfs file: copies the current
 * gpu_error.test_irq_rings bitmask into *@val. Always returns 0.
 */
static int
i915_ring_test_irq_get(void *data, u64 *val)
{
struct drm_i915_private *dev_priv = data;
*val = dev_priv->gpu_error.test_irq_rings;
return 0;
}
/*
 * Setter for the "i915_ring_test_irq" debugfs file.
 *
 * Rejects the request with -ENODEV when GuC submission is in use or on
 * gen11+; otherwise restricts @val to the engines in ring_mask, logs the
 * result and stores it in gpu_error.test_irq_rings via fault_irq_set().
 *
 * Returns 0 on success, -ENODEV when unsupported, or the error returned by
 * fault_irq_set().
 */
static int
i915_ring_test_irq_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
/* GuC keeps the user interrupt permanently enabled for submission */
if (USES_GUC_SUBMISSION(i915))
return -ENODEV;
/*
 * From icl, we can no longer individually mask interrupt generation
 * from each engine.
 */
if (INTEL_GEN(i915) >= 11)
return -ENODEV;
/* Drop bits that do not correspond to an engine in ring_mask. */
val &= INTEL_INFO(i915)->ring_mask;
DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val);
}
/* Pairs the test-irq getter and setter; the value uses the "0x%08llx\n" format. */
DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
i915_ring_test_irq_get, i915_ring_test_irq_set,
"0x%08llx\n");
#define DROP_UNBOUND BIT(0) #define DROP_UNBOUND BIT(0)
#define DROP_BOUND BIT(1) #define DROP_BOUND BIT(1)
#define DROP_RETIRE BIT(2) #define DROP_RETIRE BIT(2)
...@@ -4750,8 +4659,6 @@ static const struct i915_debugfs_files { ...@@ -4750,8 +4659,6 @@ static const struct i915_debugfs_files {
} i915_debugfs_files[] = { } i915_debugfs_files[] = {
{"i915_wedged", &i915_wedged_fops}, {"i915_wedged", &i915_wedged_fops},
{"i915_cache_sharing", &i915_cache_sharing_fops}, {"i915_cache_sharing", &i915_cache_sharing_fops},
{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
{"i915_ring_test_irq", &i915_ring_test_irq_fops},
{"i915_gem_drop_caches", &i915_drop_caches_fops}, {"i915_gem_drop_caches", &i915_drop_caches_fops},
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
{"i915_error_state", &i915_error_state_fops}, {"i915_error_state", &i915_error_state_fops},
......
...@@ -723,8 +723,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, ...@@ -723,8 +723,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
err_printf(m, "CCID: 0x%08x\n", error->ccid); err_printf(m, "CCID: 0x%08x\n", error->ccid);
err_printf(m, "Missed interrupts: 0x%08lx\n",
m->i915->gpu_error.missed_irq_rings);
for (i = 0; i < error->nfence; i++) for (i = 0; i < error->nfence; i++)
err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]);
......
...@@ -204,8 +204,6 @@ struct i915_gpu_error { ...@@ -204,8 +204,6 @@ struct i915_gpu_error {
atomic_t pending_fb_pin; atomic_t pending_fb_pin;
unsigned long missed_irq_rings;
/** /**
* State variable controlling the reset flow and count * State variable controlling the reset flow and count
* *
...@@ -274,9 +272,6 @@ struct i915_gpu_error { ...@@ -274,9 +272,6 @@ struct i915_gpu_error {
*/ */
wait_queue_head_t reset_queue; wait_queue_head_t reset_queue;
/* For missed irq/seqno simulation. */
unsigned long test_irq_rings;
struct i915_gpu_restart *restart; struct i915_gpu_restart *restart;
}; };
......
...@@ -91,7 +91,6 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine) ...@@ -91,7 +91,6 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine)
spin_lock(&b->irq_lock); spin_lock(&b->irq_lock);
b->irq_fired = true;
if (b->irq_armed && list_empty(&b->signalers)) if (b->irq_armed && list_empty(&b->signalers))
__intel_breadcrumbs_disarm_irq(b); __intel_breadcrumbs_disarm_irq(b);
...@@ -172,86 +171,6 @@ static void signal_irq_work(struct irq_work *work) ...@@ -172,86 +171,6 @@ static void signal_irq_work(struct irq_work *work)
intel_engine_breadcrumbs_irq(engine); intel_engine_breadcrumbs_irq(engine);
} }
/*
 * Expiry (in absolute jiffies) for the next breadcrumb hangcheck: one
 * DRM_I915_HANGCHECK_JIFFIES interval from now, passed through
 * round_jiffies_up().
 */
static unsigned long wait_timeout(void)
{
return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
}
/*
 * Record that @engine missed a breadcrumb interrupt: when GEM_SHOW_DEBUG()
 * is true, dump the engine state (tagged with the caller's return address)
 * to the debug printer, then set the engine's bit in
 * gpu_error.missed_irq_rings.
 */
static noinline void missed_breadcrumb(struct intel_engine_cs *engine)
{
if (GEM_SHOW_DEBUG()) {
struct drm_printer p = drm_debug_printer(__func__);
intel_engine_dump(engine, &p,
"%s missed breadcrumb at %pS\n",
engine->name, __builtin_return_address(0));
}
set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
}
/*
 * Timer callback for breadcrumbs.hangcheck.
 *
 * Does nothing once the breadcrumb irq is no longer armed. If an interrupt
 * was seen since the last run (irq_fired), the flag is cleared and the timer
 * is re-armed for another wait_timeout() interval. Otherwise, after
 * synchronizing with the hard irq handler, breadcrumbs are signalled by
 * hand: if intel_engine_signal_breadcrumbs() returns true the engine is
 * marked as having missed an interrupt and the fake_irq timer is started
 * for the next jiffie; if not, the hangcheck timer is re-armed.
 */
static void intel_breadcrumbs_hangcheck(struct timer_list *t)
{
struct intel_engine_cs *engine =
from_timer(engine, t, breadcrumbs.hangcheck);
struct intel_breadcrumbs *b = &engine->breadcrumbs;
if (!b->irq_armed)
return;
/* An interrupt arrived during the last interval; just re-arm. */
if (b->irq_fired)
goto rearm;
/*
 * We keep the hangcheck timer alive until we disarm the irq, even
 * if there are no waiters at present.
 *
 * If the waiter was currently running, assume it hasn't had a chance
 * to process the pending interrupt (e.g, low priority task on a loaded
 * system) and wait until it sleeps before declaring a missed interrupt.
 *
 * If the waiter was asleep (and not even pending a wakeup), then we
 * must have missed an interrupt as the GPU has stopped advancing
 * but we still have a waiter. Assuming all batches complete within
 * DRM_I915_HANGCHECK_JIFFIES [1.5s]!
 */
synchronize_hardirq(engine->i915->drm.irq);
if (intel_engine_signal_breadcrumbs(engine)) {
missed_breadcrumb(engine);
mod_timer(&b->fake_irq, jiffies + 1);
} else {
/* The label sits inside the else branch so the goto above shares this path. */
rearm:
b->irq_fired = false;
mod_timer(&b->hangcheck, wait_timeout());
}
}
/*
 * Timer callback for breadcrumbs.fake_irq.
 *
 * Signals breadcrumbs by hand, then decides which timer keeps running:
 * neither if nothing was signalled and the irq is not armed, the hangcheck
 * timer if the engine's missed_irq_rings bit has been cleared, and otherwise
 * this timer again on the next jiffie.
 */
static void intel_breadcrumbs_fake_irq(struct timer_list *t)
{
struct intel_engine_cs *engine =
from_timer(engine, t, breadcrumbs.fake_irq);
struct intel_breadcrumbs *b = &engine->breadcrumbs;
/*
 * The timer persists in case we cannot enable interrupts,
 * or if we have previously seen seqno/interrupt incoherency
 * ("missed interrupt" syndrome, better known as a "missed breadcrumb").
 * Here the worker will wake up every jiffie in order to kick the
 * oldest waiter to do the coherent seqno check.
 */
if (!intel_engine_signal_breadcrumbs(engine) && !b->irq_armed)
return;
/* If the user has disabled the fake-irq, restore the hangchecking */
if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
mod_timer(&b->hangcheck, wait_timeout());
return;
}
mod_timer(&b->fake_irq, jiffies + 1);
}
void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine) void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine)
{ {
struct intel_breadcrumbs *b = &engine->breadcrumbs; struct intel_breadcrumbs *b = &engine->breadcrumbs;
...@@ -274,43 +193,14 @@ void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine) ...@@ -274,43 +193,14 @@ void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine)
spin_unlock_irq(&b->irq_lock); spin_unlock_irq(&b->irq_lock);
} }
static bool use_fake_irq(const struct intel_breadcrumbs *b) static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
{
const struct intel_engine_cs *engine =
container_of(b, struct intel_engine_cs, breadcrumbs);
if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
return false;
/*
* Only start with the heavy weight fake irq timer if we have not
* seen any interrupts since enabling it the first time. If the
* interrupts are still arriving, it means we made a mistake in our
* engine->seqno_barrier(), a timing error that should be transient
* and unlikely to reoccur.
*/
return !b->irq_fired;
}
/*
 * Start whichever timer backs up the breadcrumb interrupt: the per-jiffie
 * fake_irq timer when b->irq_enabled is zero or use_fake_irq() says so,
 * otherwise the hangcheck timer at wait_timeout().
 */
static void enable_fake_irq(struct intel_breadcrumbs *b)
{
/* Ensure we never sleep indefinitely */
if (!b->irq_enabled || use_fake_irq(b))
mod_timer(&b->fake_irq, jiffies + 1);
else
mod_timer(&b->hangcheck, wait_timeout());
}
static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
{ {
struct intel_engine_cs *engine = struct intel_engine_cs *engine =
container_of(b, struct intel_engine_cs, breadcrumbs); container_of(b, struct intel_engine_cs, breadcrumbs);
struct drm_i915_private *i915 = engine->i915;
bool enabled;
lockdep_assert_held(&b->irq_lock); lockdep_assert_held(&b->irq_lock);
if (b->irq_armed) if (b->irq_armed)
return false; return;
/* /*
* The breadcrumb irq will be disarmed on the interrupt after the * The breadcrumb irq will be disarmed on the interrupt after the
...@@ -328,16 +218,8 @@ static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) ...@@ -328,16 +218,8 @@ static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
* the driver is idle) we disarm the breadcrumbs. * the driver is idle) we disarm the breadcrumbs.
*/ */
/* No interrupts? Kick the waiter every jiffie! */ if (!b->irq_enabled++)
enabled = false;
if (!b->irq_enabled++ &&
!test_bit(engine->id, &i915->gpu_error.test_irq_rings)) {
irq_enable(engine); irq_enable(engine);
enabled = true;
}
enable_fake_irq(b);
return enabled;
} }
void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine) void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
...@@ -348,18 +230,6 @@ void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine) ...@@ -348,18 +230,6 @@ void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
INIT_LIST_HEAD(&b->signalers); INIT_LIST_HEAD(&b->signalers);
init_irq_work(&b->irq_work, signal_irq_work); init_irq_work(&b->irq_work, signal_irq_work);
timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
}
static void cancel_fake_irq(struct intel_engine_cs *engine)
{
struct intel_breadcrumbs *b = &engine->breadcrumbs;
del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */
del_timer_sync(&b->hangcheck);
clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
} }
void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
...@@ -369,13 +239,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) ...@@ -369,13 +239,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
spin_lock_irqsave(&b->irq_lock, flags); spin_lock_irqsave(&b->irq_lock, flags);
/*
* Leave the fake_irq timer enabled (if it is running), but clear the
* bit so that it turns itself off on its next wake up and goes back
* to the long hangcheck interval if still required.
*/
clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
if (b->irq_enabled) if (b->irq_enabled)
irq_enable(engine); irq_enable(engine);
else else
...@@ -386,7 +249,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) ...@@ -386,7 +249,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine) void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
{ {
cancel_fake_irq(engine);
} }
bool i915_request_enable_breadcrumb(struct i915_request *rq) bool i915_request_enable_breadcrumb(struct i915_request *rq)
...@@ -482,7 +344,4 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, ...@@ -482,7 +344,4 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine,
} }
} }
spin_unlock_irq(&b->irq_lock); spin_unlock_irq(&b->irq_lock);
if (test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
drm_printf(p, "Fake irq active\n");
} }
...@@ -275,6 +275,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work) ...@@ -275,6 +275,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
for_each_engine(engine, dev_priv, id) { for_each_engine(engine, dev_priv, id) {
struct hangcheck hc; struct hangcheck hc;
intel_engine_signal_breadcrumbs(engine);
hangcheck_load_sample(engine, &hc); hangcheck_load_sample(engine, &hc);
hangcheck_accumulate_sample(engine, &hc); hangcheck_accumulate_sample(engine, &hc);
hangcheck_store_sample(engine, &hc); hangcheck_store_sample(engine, &hc);
......
...@@ -387,14 +387,9 @@ struct intel_engine_cs { ...@@ -387,14 +387,9 @@ struct intel_engine_cs {
struct irq_work irq_work; /* for use from inside irq_lock */ struct irq_work irq_work; /* for use from inside irq_lock */
struct timer_list fake_irq; /* used after a missed interrupt */
struct timer_list hangcheck; /* detect missed interrupts */
unsigned int hangcheck_interrupts;
unsigned int irq_enabled; unsigned int irq_enabled;
bool irq_armed; bool irq_armed;
bool irq_fired;
} breadcrumbs; } breadcrumbs;
struct { struct {
......
...@@ -35,7 +35,6 @@ int igt_live_test_begin(struct igt_live_test *t, ...@@ -35,7 +35,6 @@ int igt_live_test_begin(struct igt_live_test *t,
return err; return err;
} }
i915->gpu_error.missed_irq_rings = 0;
t->reset_global = i915_reset_count(&i915->gpu_error); t->reset_global = i915_reset_count(&i915->gpu_error);
for_each_engine(engine, i915, id) for_each_engine(engine, i915, id)
...@@ -75,11 +74,5 @@ int igt_live_test_end(struct igt_live_test *t) ...@@ -75,11 +74,5 @@ int igt_live_test_end(struct igt_live_test *t)
return -EIO; return -EIO;
} }
if (i915->gpu_error.missed_irq_rings) {
pr_err("%s(%s): Missed interrupts on engines %lx\n",
t->func, t->name, i915->gpu_error.missed_irq_rings);
return -EIO;
}
return 0; return 0;
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment