Commit 98a2f411 authored by Chris Wilson

drm/i915: Allow disabling error capture

We currently capture the GPU state after we detect a hang. This is vital
for us to both triage and debug hangs in the wild (post-mortem
debugging). However, it comes at the cost of running some potentially
dangerous code (since it has to make very few assumptions about the state
of the driver) that is quite resource intensive.

This patch introduces both a method to disable error capture at runtime
(for users who hit bugs at runtime and need a workaround) and to disable
error capture at compile time (for realtime users who want to minimise
any possible latency, and never require error capture, saving ~30k of
code). The cost is that we now have to be wary of (and test!) a kconfig
flag and a module parameter. The effect of the module parameter is easy
to verify through code inspection and runtime testing, but a kconfig flag
needs regular compile checking.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Acked-by: Jani Nikula <jani.nikula@linux.intel.com>
Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: http://patchwork.freedesktop.org/patch/msgid/20161012090522.367-2-chris@chris-wilson.co.uk
parent 0e704476
...@@ -46,6 +46,19 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT ...@@ -46,6 +46,19 @@ config DRM_I915_PRELIMINARY_HW_SUPPORT
If in doubt, say "N". If in doubt, say "N".
config DRM_I915_CAPTURE_ERROR
bool "Enable capturing GPU state following a hang"
depends on DRM_I915
default y
help
This option enables capturing the GPU state when a hang is detected.
This information is vital for triaging hangs and assists in debugging.
Please report any hang to
https://bugs.freedesktop.org/enter_bug.cgi?product=DRI
for triaging.
If in doubt, say "Y".
config DRM_I915_USERPTR config DRM_I915_USERPTR
bool "Always enable userptr support" bool "Always enable userptr support"
depends on DRM_I915 depends on DRM_I915
......
...@@ -42,7 +42,6 @@ i915-y += i915_cmd_parser.o \ ...@@ -42,7 +42,6 @@ i915-y += i915_cmd_parser.o \
i915_gem_stolen.o \ i915_gem_stolen.o \
i915_gem_tiling.o \ i915_gem_tiling.o \
i915_gem_userptr.o \ i915_gem_userptr.o \
i915_gpu_error.o \
i915_trace_points.o \ i915_trace_points.o \
intel_breadcrumbs.o \ intel_breadcrumbs.o \
intel_engine_cs.o \ intel_engine_cs.o \
...@@ -107,6 +106,9 @@ i915-y += dvo_ch7017.o \ ...@@ -107,6 +106,9 @@ i915-y += dvo_ch7017.o \
intel_sdvo.o \ intel_sdvo.o \
intel_tv.o intel_tv.o
# Post-mortem debug and GPU hang state capture
i915-$(CONFIG_DRM_I915_CAPTURE_ERROR) += i915_gpu_error.o
# virtual gpu code # virtual gpu code
i915-y += i915_vgpu.o i915-y += i915_vgpu.o
......
...@@ -960,6 +960,8 @@ static int i915_hws_info(struct seq_file *m, void *data) ...@@ -960,6 +960,8 @@ static int i915_hws_info(struct seq_file *m, void *data)
return 0; return 0;
} }
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
static ssize_t static ssize_t
i915_error_state_write(struct file *filp, i915_error_state_write(struct file *filp,
const char __user *ubuf, const char __user *ubuf,
...@@ -1042,6 +1044,8 @@ static const struct file_operations i915_error_state_fops = { ...@@ -1042,6 +1044,8 @@ static const struct file_operations i915_error_state_fops = {
.release = i915_error_state_release, .release = i915_error_state_release,
}; };
#endif
static int static int
i915_next_seqno_get(void *data, u64 *val) i915_next_seqno_get(void *data, u64 *val)
{ {
...@@ -5398,7 +5402,9 @@ static const struct i915_debugfs_files { ...@@ -5398,7 +5402,9 @@ static const struct i915_debugfs_files {
{"i915_ring_missed_irq", &i915_ring_missed_irq_fops}, {"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
{"i915_ring_test_irq", &i915_ring_test_irq_fops}, {"i915_ring_test_irq", &i915_ring_test_irq_fops},
{"i915_gem_drop_caches", &i915_drop_caches_fops}, {"i915_gem_drop_caches", &i915_drop_caches_fops},
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
{"i915_error_state", &i915_error_state_fops}, {"i915_error_state", &i915_error_state_fops},
#endif
{"i915_next_seqno", &i915_next_seqno_fops}, {"i915_next_seqno", &i915_next_seqno_fops},
{"i915_display_crc_ctl", &i915_display_crc_ctl_fops}, {"i915_display_crc_ctl", &i915_display_crc_ctl_fops},
{"i915_pri_wm_latency", &i915_pri_wm_latency_fops}, {"i915_pri_wm_latency", &i915_pri_wm_latency_fops},
......
...@@ -3544,6 +3544,8 @@ static inline void intel_display_crc_init(struct drm_i915_private *dev_priv) {} ...@@ -3544,6 +3544,8 @@ static inline void intel_display_crc_init(struct drm_i915_private *dev_priv) {}
#endif #endif
/* i915_gpu_error.c */ /* i915_gpu_error.c */
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
__printf(2, 3) __printf(2, 3)
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...); void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
int i915_error_state_to_str(struct drm_i915_error_state_buf *estr, int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
...@@ -3564,6 +3566,20 @@ void i915_error_state_get(struct drm_device *dev, ...@@ -3564,6 +3566,20 @@ void i915_error_state_get(struct drm_device *dev,
void i915_error_state_put(struct i915_error_state_file_priv *error_priv); void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
void i915_destroy_error_state(struct drm_device *dev); void i915_destroy_error_state(struct drm_device *dev);
#else
/*
 * No-op stand-in selected when CONFIG_DRM_I915_CAPTURE_ERROR is disabled.
 * All three arguments are ignored and nothing is recorded, so callers can
 * invoke it unconditionally.
 */
static inline void
i915_capture_error_state(struct drm_i915_private *dev_priv,
			 u32 engine_mask, const char *error_msg)
{
}
/*
 * No-op stand-in selected when CONFIG_DRM_I915_CAPTURE_ERROR is disabled;
 * the body is intentionally empty and @dev is ignored.
 */
static inline void
i915_destroy_error_state(struct drm_device *dev)
{
}
#endif
const char *i915_cache_level_str(struct drm_i915_private *i915, int type); const char *i915_cache_level_str(struct drm_i915_private *i915, int type);
/* i915_cmd_parser.c */ /* i915_cmd_parser.c */
......
...@@ -1464,6 +1464,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, ...@@ -1464,6 +1464,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
struct drm_i915_error_state *error; struct drm_i915_error_state *error;
unsigned long flags; unsigned long flags;
if (!i915.error_capture)
return;
if (READ_ONCE(dev_priv->gpu_error.first_error)) if (READ_ONCE(dev_priv->gpu_error.first_error))
return; return;
......
...@@ -47,6 +47,7 @@ struct i915_params i915 __read_mostly = { ...@@ -47,6 +47,7 @@ struct i915_params i915 __read_mostly = {
.load_detect_test = 0, .load_detect_test = 0,
.force_reset_modeset_test = 0, .force_reset_modeset_test = 0,
.reset = true, .reset = true,
.error_capture = true,
.invert_brightness = 0, .invert_brightness = 0,
.disable_display = 0, .disable_display = 0,
.enable_cmd_parser = 1, .enable_cmd_parser = 1,
...@@ -115,6 +116,14 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type, ...@@ -115,6 +116,14 @@ MODULE_PARM_DESC(vbt_sdvo_panel_type,
module_param_named_unsafe(reset, i915.reset, bool, 0600); module_param_named_unsafe(reset, i915.reset, bool, 0600);
MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)"); MODULE_PARM_DESC(reset, "Attempt GPU resets (default: true)");
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
/*
 * Runtime switch for GPU error capture, backed by i915.error_capture.
 * The parameter is only registered when error capture is compiled in
 * (CONFIG_DRM_I915_CAPTURE_ERROR); it is registered with mode 0600.
 */
module_param_named(error_capture, i915.error_capture, bool, 0600);
MODULE_PARM_DESC(error_capture,
"Record the GPU state following a hang. "
"This information in /sys/class/drm/card<N>/error is vital for "
"triaging and debugging hangs.");
#endif
module_param_named_unsafe(enable_hangcheck, i915.enable_hangcheck, bool, 0644); module_param_named_unsafe(enable_hangcheck, i915.enable_hangcheck, bool, 0644);
MODULE_PARM_DESC(enable_hangcheck, MODULE_PARM_DESC(enable_hangcheck,
"Periodically check GPU activity for detecting hangs. " "Periodically check GPU activity for detecting hangs. "
......
...@@ -59,6 +59,7 @@ struct i915_params { ...@@ -59,6 +59,7 @@ struct i915_params {
bool load_detect_test; bool load_detect_test;
bool force_reset_modeset_test; bool force_reset_modeset_test;
bool reset; bool reset;
bool error_capture;
bool disable_display; bool disable_display;
bool verbose_state_checks; bool verbose_state_checks;
bool nuclear_pageflip; bool nuclear_pageflip;
......
...@@ -514,6 +514,8 @@ static const struct attribute *vlv_attrs[] = { ...@@ -514,6 +514,8 @@ static const struct attribute *vlv_attrs[] = {
NULL, NULL,
}; };
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
static ssize_t error_state_read(struct file *filp, struct kobject *kobj, static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *attr, char *buf, struct bin_attribute *attr, char *buf,
loff_t off, size_t count) loff_t off, size_t count)
...@@ -571,6 +573,21 @@ static struct bin_attribute error_state_attr = { ...@@ -571,6 +573,21 @@ static struct bin_attribute error_state_attr = {
.write = error_state_write, .write = error_state_write,
}; };
/*
 * Create the error_state binary attribute under @kdev's kobject.
 * A failure is only logged; it is not reported back to the caller.
 */
static void
i915_setup_error_capture(struct device *kdev)
{
	int err;

	err = sysfs_create_bin_file(&kdev->kobj, &error_state_attr);
	if (err)
		DRM_ERROR("error_state sysfs setup failed\n");
}
/* Remove the error_state binary attribute from @kdev's kobject. */
static void
i915_teardown_error_capture(struct device *kdev)
{
	struct kobject *kobj = &kdev->kobj;

	sysfs_remove_bin_file(kobj, &error_state_attr);
}
#else
/* Error capture is compiled out: no sysfs attribute is created. */
static void
i915_setup_error_capture(struct device *kdev)
{
}
/* Error capture is compiled out: there is no sysfs attribute to remove. */
static void
i915_teardown_error_capture(struct device *kdev)
{
}
#endif
void i915_setup_sysfs(struct drm_i915_private *dev_priv) void i915_setup_sysfs(struct drm_i915_private *dev_priv)
{ {
struct device *kdev = dev_priv->drm.primary->kdev; struct device *kdev = dev_priv->drm.primary->kdev;
...@@ -617,17 +634,15 @@ void i915_setup_sysfs(struct drm_i915_private *dev_priv) ...@@ -617,17 +634,15 @@ void i915_setup_sysfs(struct drm_i915_private *dev_priv)
if (ret) if (ret)
DRM_ERROR("RPS sysfs setup failed\n"); DRM_ERROR("RPS sysfs setup failed\n");
ret = sysfs_create_bin_file(&kdev->kobj, i915_setup_error_capture(kdev);
&error_state_attr);
if (ret)
DRM_ERROR("error_state sysfs setup failed\n");
} }
void i915_teardown_sysfs(struct drm_i915_private *dev_priv) void i915_teardown_sysfs(struct drm_i915_private *dev_priv)
{ {
struct device *kdev = dev_priv->drm.primary->kdev; struct device *kdev = dev_priv->drm.primary->kdev;
sysfs_remove_bin_file(&kdev->kobj, &error_state_attr); i915_teardown_error_capture(kdev);
if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv))
sysfs_remove_files(&kdev->kobj, vlv_attrs); sysfs_remove_files(&kdev->kobj, vlv_attrs);
else else
......
...@@ -17097,6 +17097,8 @@ int intel_modeset_vga_set_state(struct drm_device *dev, bool state) ...@@ -17097,6 +17097,8 @@ int intel_modeset_vga_set_state(struct drm_device *dev, bool state)
return 0; return 0;
} }
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
struct intel_display_error_state { struct intel_display_error_state {
u32 power_well_driver; u32 power_well_driver;
...@@ -17279,3 +17281,5 @@ intel_display_print_error_state(struct drm_i915_error_state_buf *m, ...@@ -17279,3 +17281,5 @@ intel_display_print_error_state(struct drm_i915_error_state_buf *m,
err_printf(m, " VSYNC: %08x\n", error->transcoder[i].vsync); err_printf(m, " VSYNC: %08x\n", error->transcoder[i].vsync);
} }
} }
#endif
...@@ -1470,6 +1470,8 @@ void intel_cleanup_overlay(struct drm_i915_private *dev_priv) ...@@ -1470,6 +1470,8 @@ void intel_cleanup_overlay(struct drm_i915_private *dev_priv)
kfree(dev_priv->overlay); kfree(dev_priv->overlay);
} }
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
struct intel_overlay_error_state { struct intel_overlay_error_state {
struct overlay_registers regs; struct overlay_registers regs;
unsigned long base; unsigned long base;
...@@ -1587,3 +1589,5 @@ intel_overlay_print_error_state(struct drm_i915_error_state_buf *m, ...@@ -1587,3 +1589,5 @@ intel_overlay_print_error_state(struct drm_i915_error_state_buf *m,
P(UVSCALEV); P(UVSCALEV);
#undef P #undef P
} }
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment