Commit e2ccf0d0 authored by Chris Wilson

drm/i915/execlists: Double check breadcrumb before crying foul

  process_csb: 0000:00:02.0 bcs0: cs-irq head=4, tail=5
  process_csb: 0000:00:02.0 bcs0: csb[5]: status=0x00008002:0x60000020
  trace_ports: 0000:00:02.0 bcs0: preempted { ff84:45154! prio 2 }
  trace_ports: 0000:00:02.0 bcs0: promote { ff84:45155* prio 2 }
  trace_ports: 0000:00:02.0 bcs0: submit { ff84:45156 prio 2 }

  process_csb: 0000:00:02.0 bcs0: cs-irq head=5, tail=6
  process_csb: 0000:00:02.0 bcs0: csb[6]: status=0x00000018:0x60000020
  trace_ports: 0000:00:02.0 bcs0: completed { ff84:45155* prio 2 }
  process_csb: 0000:00:02.0 bcs0: ring:{start:0x00178000, head:0928, tail:0928, ctl:00000000, mode:00000200}
  process_csb: 0000:00:02.0 bcs0: rq:{start:00178000, head:08b0, tail:08f0, seqno:ff84:45155, hwsp:45156},
  process_csb: 0000:00:02.0 bcs0: ctx:{start:00178000, head:e000928, tail:0928},
  process_csb: GEM_BUG_ON("context completed before request")

In this sequence, we can see that although we have submitted the next
request [ff84:45156] to HW (via ELSP[]) it has not yet reported the
lite-restore. Instead, we see the completion event of the currently
active request [ff84:45155] but at the time of processing that event,
the breadcrumb has not yet been written. Though by the time we do print
out the debug info, the seqno write of ff84:45156 has landed!

Therefore there is a serialisation problem between the seqno writes and
CS events, not just between the CS buffer and its head/tail pointers as
previously observed on Icelake.

This is not a huge problem, as we don't strictly rely on the breadcrumb
to determine HW activity, but it may indicate that interrupt delivery is
before the seqno write, aka bringing back the plague of missed
interrupts from yesteryear. However, there is no indication of this
wider problem, so let's just flush the seqno read before reporting an
error. If it persists after the fresh read we can worry again.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200330234318.30638-1-chris@chris-wilson.co.uk
parent d16e137e
......@@ -2342,6 +2342,13 @@ gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
}
/*
 * flush_hwsp - flush the CPU cache line that rq->hwsp_seqno points at.
 * @rq: request whose hwsp_seqno pointer identifies the line to flush
 *
 * Intended to be called before re-checking whether @rq has completed, so
 * that the re-check does not rely on a stale cached copy of the breadcrumb.
 * The function only flushes; it does not read or return the seqno itself.
 */
static inline void flush_hwsp(const struct i915_request *rq)
{
/* Full barrier: order all earlier loads/stores before the flush. */
mb();
/*
 * READ_ONCE() takes a single snapshot of the pointer; the (void *) cast
 * presumably just strips qualifiers to match clflush()'s argument type
 * (NOTE(review): confirm against the hwsp_seqno declaration).
 */
clflush((void *)READ_ONCE(rq->hwsp_seqno));
/* Full barrier: order the flush before any later access to the seqno. */
mb();
}
static void process_csb(struct intel_engine_cs *engine)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
......@@ -2418,8 +2425,6 @@ static void process_csb(struct intel_engine_cs *engine)
if (promote) {
struct i915_request * const *old = execlists->active;
GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
ring_set_paused(engine, 0);
/* Point active to the new ELSP; prevent overwriting */
......@@ -2432,6 +2437,7 @@ static void process_csb(struct intel_engine_cs *engine)
execlists_schedule_out(*old++);
/* switch pending to inflight */
GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
memcpy(execlists->inflight,
execlists->pending,
execlists_num_ports(execlists) *
......@@ -2453,13 +2459,24 @@ static void process_csb(struct intel_engine_cs *engine)
* user interrupt and CSB is processed.
*/
if (GEM_SHOW_DEBUG() &&
!i915_request_completed(*execlists->active) &&
!reset_in_progress(execlists)) {
struct i915_request *rq __maybe_unused =
*execlists->active;
!i915_request_completed(*execlists->active)) {
struct i915_request *rq = *execlists->active;
const u32 *regs __maybe_unused =
rq->context->lrc_reg_state;
/*
* Flush the breadcrumb before crying foul.
*
* Since we have hit this on icl and seen the
* breadcrumb advance as we print out the debug
* info (so the problem corrected itself without
* lasting damage), and we know that icl suffers
* from missing global observation points in
* execlists, presume that affects even more
* coherency.
*/
flush_hwsp(rq);
ENGINE_TRACE(engine,
"ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
ENGINE_READ(engine, RING_START),
......@@ -2480,7 +2497,10 @@ static void process_csb(struct intel_engine_cs *engine)
regs[CTX_RING_HEAD],
regs[CTX_RING_TAIL]);
GEM_BUG_ON("context completed before request");
/* Still? Declare it caput! */
if (!i915_request_completed(rq) &&
!reset_in_progress(execlists))
GEM_BUG_ON("context completed before request");
}
execlists_schedule_out(*execlists->active++);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment