Commit bda30024 authored by Tvrtko Ursulin's avatar Tvrtko Ursulin

drm/i915: Improve record of hung engines in error state

Between events which trigger engine and GPU resets and capturing the error
state we lose information on which engine triggered the reset. Improve
this by passing in the hung engine mask down to error capture.

Result is that the list of engines in user visible "GPU HANG: ecode
<gen>:<engines>:<ecode>, <process>" is now a list of hanging and not just
active engines. Most importantly the displayed process is now the one
which was actually hung.

v2:
 * Stub prototype. (Chris)
Signed-off-by: default avatarTvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20201104134743.916027-1-tvrtko.ursulin@linux.intel.com
parent ad18fa0f
......@@ -3037,6 +3037,8 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
if (!cap->error->gt->engine)
goto err_gt;
cap->error->gt->engine->hung = true;
return cap;
err_gt:
......
......@@ -1251,7 +1251,7 @@ void intel_gt_handle_error(struct intel_gt *gt,
engine_mask &= gt->info.engine_mask;
if (flags & I915_ERROR_CAPTURE) {
i915_capture_error_state(gt->i915);
i915_capture_error_state(gt, engine_mask);
intel_gt_clear_error_registers(gt, engine_mask);
}
......
......@@ -725,7 +725,7 @@ static int i915_gpu_info_open(struct inode *inode, struct file *file)
gpu = NULL;
with_intel_runtime_pm(&i915->runtime_pm, wakeref)
gpu = i915_gpu_coredump(i915);
gpu = i915_gpu_coredump(&i915->gt, ALL_ENGINES);
if (IS_ERR(gpu))
return PTR_ERR(gpu);
......
......@@ -570,6 +570,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
ee->vm_info.pp_dir_base);
}
}
err_printf(m, " hung: %u\n", ee->hung);
err_printf(m, " engine reset count: %u\n", ee->reset_count);
for (n = 0; n < ee->num_ports; n++) {
......@@ -1456,6 +1457,7 @@ capture_engine(struct intel_engine_cs *engine,
static void
gt_record_engines(struct intel_gt_coredump *gt,
intel_engine_mask_t engine_mask,
struct i915_vma_compress *compress)
{
struct intel_engine_cs *engine;
......@@ -1471,6 +1473,8 @@ gt_record_engines(struct intel_gt_coredump *gt,
if (!ee)
continue;
ee->hung = engine->mask & engine_mask;
gt->simulated |= ee->simulated;
if (ee->simulated) {
kfree(ee);
......@@ -1663,11 +1667,13 @@ static const char *error_msg(struct i915_gpu_coredump *error)
for (gt = error->gt; gt; gt = gt->next) {
struct intel_engine_coredump *cs;
if (gt->engine && !first)
first = gt->engine;
for (cs = gt->engine; cs; cs = cs->next)
for (cs = gt->engine; cs; cs = cs->next) {
if (cs->hung) {
engines |= cs->engine->mask;
if (!first)
first = cs;
}
}
}
len = scnprintf(error->error_msg, sizeof(error->error_msg),
......@@ -1781,8 +1787,10 @@ void i915_vma_capture_finish(struct intel_gt_coredump *gt,
kfree(compress);
}
struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
struct i915_gpu_coredump *
i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
struct drm_i915_private *i915 = gt->i915;
struct i915_gpu_coredump *error;
/* Check if GPU capture has been disabled */
......@@ -1794,7 +1802,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
if (!error)
return ERR_PTR(-ENOMEM);
error->gt = intel_gt_coredump_alloc(&i915->gt, ALLOW_FAIL);
error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL);
if (error->gt) {
struct i915_vma_compress *compress;
......@@ -1806,7 +1814,7 @@ struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915)
}
gt_record_info(error->gt);
gt_record_engines(error->gt, compress);
gt_record_engines(error->gt, engine_mask, compress);
if (INTEL_INFO(i915)->has_gt_uc)
error->gt->uc = gt_record_uc(error->gt, compress);
......@@ -1853,20 +1861,23 @@ void i915_error_state_store(struct i915_gpu_coredump *error)
/**
* i915_capture_error_state - capture an error record for later analysis
* @i915: i915 device
* @gt: intel_gt which originated the hang
* @engine_mask: hung engines
*
*
* Should be called when an error is detected (either a hang or an error
* interrupt) to capture error state from the time of the error. Fills
* out a structure which becomes available in debugfs for user level tools
* to pick up.
*/
void i915_capture_error_state(struct drm_i915_private *i915)
void i915_capture_error_state(struct intel_gt *gt,
intel_engine_mask_t engine_mask)
{
struct i915_gpu_coredump *error;
error = i915_gpu_coredump(i915);
error = i915_gpu_coredump(gt, engine_mask);
if (IS_ERR(error)) {
cmpxchg(&i915->gpu_error.first_error, NULL, error);
cmpxchg(&gt->i915->gpu_error.first_error, NULL, error);
return;
}
......
......@@ -59,6 +59,7 @@ struct i915_request_coredump {
struct intel_engine_coredump {
const struct intel_engine_cs *engine;
bool hung;
bool simulated;
u32 reset_count;
......@@ -218,8 +219,10 @@ struct drm_i915_error_state_buf {
__printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
struct i915_gpu_coredump *i915_gpu_coredump(struct drm_i915_private *i915);
void i915_capture_error_state(struct drm_i915_private *i915);
struct i915_gpu_coredump *i915_gpu_coredump(struct intel_gt *gt,
intel_engine_mask_t engine_mask);
void i915_capture_error_state(struct intel_gt *gt,
intel_engine_mask_t engine_mask);
struct i915_gpu_coredump *
i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp);
......@@ -271,7 +274,8 @@ void i915_disable_error_state(struct drm_i915_private *i915, int err);
#else
static inline void i915_capture_error_state(struct drm_i915_private *i915)
static inline void
i915_capture_error_state(struct intel_gt *gt, intel_engine_mask_t engine_mask)
{
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment