Commit b6b0fac0 authored by Mika Kuoppala, committed by Daniel Vetter

drm/i915: Use hangcheck score to find guilty context

With full ppgtt, using acthd is not enough to find the guilty
batch buffer. We get multiple false positives, as acthd is
per vm.

Instead of scanning which vm was running on a ring
to find the corresponding context, use a different, simpler
strategy for finding batches that caused a gpu hang:

If hangcheck has declared a ring to be hung, find the first non-complete
request on that ring and claim it was guilty.

v2: Rebase

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=73652
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net> (v1)
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
parent 94825369
...@@ -2332,9 +2332,10 @@ static bool i915_context_is_banned(struct drm_device *dev, ...@@ -2332,9 +2332,10 @@ static bool i915_context_is_banned(struct drm_device *dev,
static void i915_set_reset_status(struct intel_ring_buffer *ring, static void i915_set_reset_status(struct intel_ring_buffer *ring,
struct drm_i915_gem_request *request, struct drm_i915_gem_request *request,
u32 acthd) const bool guilty)
{ {
bool inside, guilty; const u32 acthd = intel_ring_get_active_head(ring);
bool inside;
unsigned long offset = 0; unsigned long offset = 0;
struct i915_hw_context *ctx = request->ctx; struct i915_hw_context *ctx = request->ctx;
struct i915_ctx_hang_stats *hs; struct i915_ctx_hang_stats *hs;
...@@ -2342,14 +2343,11 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring, ...@@ -2342,14 +2343,11 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
if (WARN_ON(!ctx)) if (WARN_ON(!ctx))
return; return;
/* Innocent until proven guilty */
guilty = false;
if (request->batch_obj) if (request->batch_obj)
offset = i915_gem_obj_offset(request->batch_obj, offset = i915_gem_obj_offset(request->batch_obj,
request_to_vm(request)); request_to_vm(request));
if (ring->hangcheck.action != HANGCHECK_WAIT && if (guilty &&
i915_request_guilty(request, acthd, &inside)) { i915_request_guilty(request, acthd, &inside)) {
DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n", DRM_DEBUG("%s hung %s bo (0x%lx ctx %d) at 0x%x\n",
ring->name, ring->name,
...@@ -2357,8 +2355,6 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring, ...@@ -2357,8 +2355,6 @@ static void i915_set_reset_status(struct intel_ring_buffer *ring,
offset, offset,
ctx->id, ctx->id,
acthd); acthd);
guilty = true;
} }
WARN_ON(!ctx->last_ring); WARN_ON(!ctx->last_ring);
...@@ -2385,19 +2381,39 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request) ...@@ -2385,19 +2381,39 @@ static void i915_gem_free_request(struct drm_i915_gem_request *request)
kfree(request); kfree(request);
} }
static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv, static struct drm_i915_gem_request *
struct intel_ring_buffer *ring) i915_gem_find_first_non_complete(struct intel_ring_buffer *ring)
{ {
u32 completed_seqno = ring->get_seqno(ring, false);
u32 acthd = intel_ring_get_active_head(ring);
struct drm_i915_gem_request *request; struct drm_i915_gem_request *request;
const u32 completed_seqno = ring->get_seqno(ring, false);
list_for_each_entry(request, &ring->request_list, list) { list_for_each_entry(request, &ring->request_list, list) {
if (i915_seqno_passed(completed_seqno, request->seqno)) if (i915_seqno_passed(completed_seqno, request->seqno))
continue; continue;
i915_set_reset_status(ring, request, acthd); return request;
} }
return NULL;
}
static void i915_gem_reset_ring_status(struct drm_i915_private *dev_priv,
struct intel_ring_buffer *ring)
{
struct drm_i915_gem_request *request;
bool ring_hung;
request = i915_gem_find_first_non_complete(ring);
if (request == NULL)
return;
ring_hung = ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG;
i915_set_reset_status(ring, request, ring_hung);
list_for_each_entry_continue(request, &ring->request_list, list)
i915_set_reset_status(ring, request, false);
} }
static void i915_gem_reset_ring_cleanup(struct drm_i915_private *dev_priv, static void i915_gem_reset_ring_cleanup(struct drm_i915_private *dev_priv,
......
...@@ -2532,7 +2532,6 @@ static void i915_hangcheck_elapsed(unsigned long data) ...@@ -2532,7 +2532,6 @@ static void i915_hangcheck_elapsed(unsigned long data)
#define BUSY 1 #define BUSY 1
#define KICK 5 #define KICK 5
#define HUNG 20 #define HUNG 20
#define FIRE 30
if (!i915.enable_hangcheck) if (!i915.enable_hangcheck)
return; return;
...@@ -2616,7 +2615,7 @@ static void i915_hangcheck_elapsed(unsigned long data) ...@@ -2616,7 +2615,7 @@ static void i915_hangcheck_elapsed(unsigned long data)
} }
for_each_ring(ring, dev_priv, i) { for_each_ring(ring, dev_priv, i) {
if (ring->hangcheck.score > FIRE) { if (ring->hangcheck.score >= HANGCHECK_SCORE_RING_HUNG) {
DRM_INFO("%s on %s\n", DRM_INFO("%s on %s\n",
stuck[i] ? "stuck" : "no progress", stuck[i] ? "stuck" : "no progress",
ring->name); ring->name);
......
...@@ -41,6 +41,8 @@ enum intel_ring_hangcheck_action { ...@@ -41,6 +41,8 @@ enum intel_ring_hangcheck_action {
HANGCHECK_HUNG, HANGCHECK_HUNG,
}; };
#define HANGCHECK_SCORE_RING_HUNG 31
struct intel_ring_hangcheck { struct intel_ring_hangcheck {
bool deadlock; bool deadlock;
u32 seqno; u32 seqno;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment