Commit 9f267eb8 authored by Chris Wilson's avatar Chris Wilson

drm/i915: Stop the machine whilst capturing the GPU crash dump

The error state is purposefully racy as we expect it to be called at any
time and so have avoided any locking whilst capturing the crash dump.
However, with multi-engine GPUs and multiple CPUs, those races can
manifest into OOPSes as we attempt to chase dangling pointers freed on
other CPUs. Under discussion are lots of ways to slow down normal
operation in order to protect the post-mortem error capture, but what it
we take the opposite approach and freeze the machine whilst the error
capture runs (note the GPU may still running, but as long as we don't
process any of the results the driver's bookkeeping will be static).

Note that by of itself, this is not a complete fix. It also depends on
the compiler barriers in list_add/list_del to prevent traversing the
lists into the void. We also depend that we only require state from
carefully controlled sources - i.e. all the state we require for
post-mortem debugging should be reachable from the request itself so
that we only have to worry about retrieving the request carefully. Once
we have the request, we know that all pointers from it are intact.

v2: Avoid drm_clflush_pages() inside stop_machine() as it may use
stop_machine() itself for its wbinvd fallback.
Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Acked-by: default avatarDaniel Vetter <daniel.vetter@ffwll.ch>
Link: http://patchwork.freedesktop.org/patch/msgid/20161012090522.367-3-chris@chris-wilson.co.uk
parent 98a2f411
...@@ -4,6 +4,7 @@ config DRM_I915 ...@@ -4,6 +4,7 @@ config DRM_I915
depends on X86 && PCI depends on X86 && PCI
select INTEL_GTT select INTEL_GTT
select INTERVAL_TREE select INTERVAL_TREE
select STOP_MACHINE
# we need shmfs for the swappable backing store, and in particular # we need shmfs for the swappable backing store, and in particular
# the shmem_readpage() which depends upon tmpfs # the shmem_readpage() which depends upon tmpfs
select SHMEM select SHMEM
......
...@@ -746,6 +746,8 @@ struct drm_i915_error_state { ...@@ -746,6 +746,8 @@ struct drm_i915_error_state {
struct kref ref; struct kref ref;
struct timeval time; struct timeval time;
struct drm_i915_private *i915;
char error_msg[128]; char error_msg[128];
bool simulated; bool simulated;
int iommu; int iommu;
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
*/ */
#include <generated/utsrelease.h> #include <generated/utsrelease.h>
#include <linux/stop_machine.h>
#include "i915_drv.h" #include "i915_drv.h"
static const char *engine_str(int engine) static const char *engine_str(int engine)
...@@ -744,14 +745,12 @@ i915_error_object_create(struct drm_i915_private *dev_priv, ...@@ -744,14 +745,12 @@ i915_error_object_create(struct drm_i915_private *dev_priv,
dst->page_count = num_pages; dst->page_count = num_pages;
while (num_pages--) { while (num_pages--) {
unsigned long flags;
void *d; void *d;
d = kmalloc(PAGE_SIZE, GFP_ATOMIC); d = kmalloc(PAGE_SIZE, GFP_ATOMIC);
if (d == NULL) if (d == NULL)
goto unwind; goto unwind;
local_irq_save(flags);
if (use_ggtt) { if (use_ggtt) {
void __iomem *s; void __iomem *s;
...@@ -770,15 +769,10 @@ i915_error_object_create(struct drm_i915_private *dev_priv, ...@@ -770,15 +769,10 @@ i915_error_object_create(struct drm_i915_private *dev_priv,
page = i915_gem_object_get_page(src, i); page = i915_gem_object_get_page(src, i);
drm_clflush_pages(&page, 1);
s = kmap_atomic(page); s = kmap_atomic(page);
memcpy(d, s, PAGE_SIZE); memcpy(d, s, PAGE_SIZE);
kunmap_atomic(s); kunmap_atomic(s);
drm_clflush_pages(&page, 1);
} }
local_irq_restore(flags);
dst->pages[i++] = d; dst->pages[i++] = d;
reloc_offset += PAGE_SIZE; reloc_offset += PAGE_SIZE;
...@@ -1447,6 +1441,31 @@ static void i915_capture_gen_state(struct drm_i915_private *dev_priv, ...@@ -1447,6 +1441,31 @@ static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
sizeof(error->device_info)); sizeof(error->device_info));
} }
static int capture(void *data)
{
struct drm_i915_error_state *error = data;
/* Ensure that what we readback from memory matches what the GPU sees */
wbinvd();
i915_capture_gen_state(error->i915, error);
i915_capture_reg_state(error->i915, error);
i915_gem_record_fences(error->i915, error);
i915_gem_record_rings(error->i915, error);
i915_capture_active_buffers(error->i915, error);
i915_capture_pinned_buffers(error->i915, error);
do_gettimeofday(&error->time);
error->overlay = intel_overlay_capture_error_state(error->i915);
error->display = intel_display_capture_error_state(error->i915);
/* And make sure we don't leave trash in the CPU cache */
wbinvd();
return 0;
}
/** /**
* i915_capture_error_state - capture an error record for later analysis * i915_capture_error_state - capture an error record for later analysis
* @dev: drm device * @dev: drm device
...@@ -1478,18 +1497,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv, ...@@ -1478,18 +1497,9 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
} }
kref_init(&error->ref); kref_init(&error->ref);
error->i915 = dev_priv;
i915_capture_gen_state(dev_priv, error); stop_machine(capture, error, NULL);
i915_capture_reg_state(dev_priv, error);
i915_gem_record_fences(dev_priv, error);
i915_gem_record_rings(dev_priv, error);
i915_capture_active_buffers(dev_priv, error);
i915_capture_pinned_buffers(dev_priv, error);
do_gettimeofday(&error->time);
error->overlay = intel_overlay_capture_error_state(dev_priv);
error->display = intel_display_capture_error_state(dev_priv);
i915_error_capture_msg(dev_priv, error, engine_mask, error_msg); i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
DRM_INFO("%s\n", error->error_msg); DRM_INFO("%s\n", error->error_msg);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment