drm/i915/ringbuffer: Move irq seqno barrier to the GPU for gen7

The irq_seqno_barrier is a tradeoff between doing work on every request (on the GPU) and doing work after every interrupt (on the CPU). We presume we have many more requests than interrupts! However, the current workaround (w/a) for Ivybridge is an implicit delay that currently fails sporadically, and fails consistently if we move the w/a into the irq handler itself. This makes the CPU barrier untenable for upcoming interrupt handler changes, and so we need to replace it with a delay on the GPU before we send the MI_USER_INTERRUPT. As it turns out, that delay is 32x MI_STORE_DWORD_IMM (the patch emits these as MI_STORE_DWORD_INDEX commands), or about 0.6us per request! Quite nasty, but the lesser of two evils looking to the future. Testcase: igt/gem_sync Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20181228171641.16531-4-chris@chris-wilson.co.uk

drm/i915/ringbuffer: Move irq seqno barrier to the GPU for gen7
The irq_seqno_barrier is a tradeoff between doing work on every request (on the GPU) and doing work after every interrupt (on the CPU). We presume we have many more requests than interrupts! However, the current workaround (w/a) for Ivybridge is an implicit delay that currently fails sporadically, and fails consistently if we move the w/a into the irq handler itself. This makes the CPU barrier untenable for upcoming interrupt handler changes, and so we need to replace it with a delay on the GPU before we send the MI_USER_INTERRUPT. As it turns out, that delay is 32x MI_STORE_DWORD_IMM (the patch emits these as MI_STORE_DWORD_INDEX commands), or about 0.6us per request! Quite nasty, but the lesser of two evils looking to the future. Testcase: igt/gem_sync Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20181228171641.16531-4-chris@chris-wilson.co.uk
1212bd82 · Chris Wilson · d9cad220 · 1212bd82
Commit 1212bd82 authored Dec 28, 2018 by Chris Wilson
Show whitespace changes
Inline Side-by-side

Showing with 44 additions and 36 deletions

drivers/gpu/drm/i915/intel_ringbuffer.c drivers/gpu/drm/i915/intel_ringbuffer.c +44 -36

No files found.
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -443,6 +443,34 @@ static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 }
 static const int gen6_xcs_emit_breadcrumb_sz = 4;
+#define GEN7_XCS_WA 32
+static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
+{
+	int i;
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
+	*cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
+	*cs++ = rq->global_seqno;
+	for (i = 0; i < GEN7_XCS_WA; i++) {
+		*cs++ = MI_STORE_DWORD_INDEX;
+		*cs++ = I915_GEM_HWS_INDEX_ADDR;
+		*cs++ = rq->global_seqno;
+	}
+	*cs++ = MI_FLUSH_DW;
+	*cs++ = 0;
+	*cs++ = 0;
+	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
+	rq->tail = intel_ring_offset(rq, cs);
+	assert_ring_tail_valid(rq->ring, rq->tail);
+}
+static const int gen7_xcs_emit_breadcrumb_sz = 8 + GEN7_XCS_WA * 3;
+#undef GEN7_XCS_WA
 static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
 {
 	/*
@@ -874,31 +902,6 @@ gen5_seqno_barrier(struct intel_engine_cs *engine)
 	usleep_range(125, 250);
 }
-static void
-gen6_seqno_barrier(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	/* Workaround to force correct ordering between irq and seqno writes on
-	 * ivb (and maybe also on snb) by reading from a CS register (like
-	 * ACTHD) before reading the status page.
-	 *
-	 * Note that this effectively stalls the read by the time it takes to
-	 * do a memory transaction, which more or less ensures that the write
-	 * from the GPU has sufficient time to invalidate the CPU cacheline.
-	 * Alternatively we could delay the interrupt from the CS ring to give
-	 * the write time to land, but that would incur a delay after every
-	 * batch i.e. much more frequent than a delay when waiting for the
-	 * interrupt (with the same net latency).
-	 *
-	 * Also note that to prevent whole machine hangs on gen7, we have to
-	 * take the spinlock to guard against concurrent cacheline access.
-	 */
-	spin_lock_irq(&dev_priv->uncore.lock);
-	POSTING_READ_FW(RING_ACTHD(engine->mmio_base));
-	spin_unlock_irq(&dev_priv->uncore.lock);
-}
 static void
 gen5_irq_enable(struct intel_engine_cs *engine)
 {
@@ -2258,10 +2261,13 @@ int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
 		engine->emit_flush = gen6_bsd_ring_flush;
 		engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
+		if (IS_GEN(dev_priv, 6)) {
 			engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
 			engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
-		if (!IS_GEN(dev_priv, 6))
+		} else {
-			engine->irq_seqno_barrier = gen6_seqno_barrier;
+			engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
+			engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
+		}
 	} else {
 		engine->emit_flush = bsd_ring_flush;
 		if (IS_GEN(dev_priv, 5))
@@ -2284,10 +2290,13 @@ int intel_init_blt_ring_buffer(struct intel_engine_cs *engine)
 	engine->emit_flush = gen6_ring_flush;
 	engine->irq_enable_mask = GT_BLT_USER_INTERRUPT;
+	if (IS_GEN(dev_priv, 6)) {
 		engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
 		engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
-	if (!IS_GEN(dev_priv, 6))
+	} else {
-		engine->irq_seqno_barrier = gen6_seqno_barrier;
+		engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
+		engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
+	}
 	return intel_init_ring_buffer(engine);
 }
@@ -2305,9 +2314,8 @@ int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine)
 	engine->irq_enable = hsw_vebox_irq_enable;
 	engine->irq_disable = hsw_vebox_irq_disable;
-	engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb;
+	engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb;
-	engine->emit_breadcrumb_sz = gen6_xcs_emit_breadcrumb_sz;
+	engine->emit_breadcrumb_sz = gen7_xcs_emit_breadcrumb_sz;
-	engine->irq_seqno_barrier = gen6_seqno_barrier;
 	return intel_init_ring_buffer(engine);
 }