// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gem/i915_gem_internal.h"

#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

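/*
 * Fixture shared by the hangcheck selftests: a non-bannable kernel
 * context, a batch object that spins on itself and a HWS page into
 * which each spinning request reports its breadcrumb.
 */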
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

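/*
 * Allocate the context, HWS page and batch object for a test and map
 * them for CPU access; the batch contents are filled in later by
 * hang_create_request().
 */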
static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915, NULL);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 i915_coherent_map_type(gt->i915, h->obj, false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

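/*
 * Build a request whose batch writes the request's seqno to the HWS
 * page and then loops back to its own start, spinning until the first
 * dword of the batch is overwritten with MI_BATCH_BUFFER_END.
 */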
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
104
{
105
	struct intel_gt *gt = h->gt;
106
	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
107
	struct drm_i915_gem_object *obj;
108
	struct i915_request *rq = NULL;
109 110
	struct i915_vma *hws, *vma;
	unsigned int flags;
111
	void *vaddr;
112 113 114
	u32 *batch;
	int err;

115
	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
116 117
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
118
		return ERR_CAST(obj);
119
	}
120

121
	vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
122 123
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
124
		i915_vm_put(vm);
125 126
		return ERR_CAST(vaddr);
	}
127

128 129
	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);
130

131 132
	h->obj = obj;
	h->batch = vaddr;
133

134
	vma = i915_vma_instance(h->obj, vm, NULL);
135 136
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
137
		return ERR_CAST(vma);
138
	}
139 140

	hws = i915_vma_instance(h->hws, vm, NULL);
141 142
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
143
		return ERR_CAST(hws);
144
	}
145 146

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
147 148
	if (err) {
		i915_vm_put(vm);
149
		return ERR_PTR(err);
150
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

156
	rq = igt_request_alloc(h->ctx, engine);
157 158
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
159
		goto unpin_hws;
160 161
	}

162
	err = igt_vma_move_to_active_unlocked(vma, rq, 0);
163
	if (err)
164
		goto cancel_rq;
165

166
	err = igt_vma_move_to_active_unlocked(hws, rq, 0);
167 168
	if (err)
		goto cancel_rq;
169 170

	batch = h->batch;
171
	if (GRAPHICS_VER(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
176
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

181
		*batch++ = MI_NOOP;
182 183 184
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
185
	} else if (GRAPHICS_VER(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
190
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

195
		*batch++ = MI_NOOP;
196 197
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
198
	} else if (GRAPHICS_VER(gt->i915) >= 4) {
199
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
200 201 202
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
203
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

208
		*batch++ = MI_NOOP;
209 210 211
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
212
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
213 214
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
215
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

220
		*batch++ = MI_NOOP;
221
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
222 223 224
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
225
	intel_gt_chipset_flush(engine->gt);
226

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

233
	flags = 0;
234
	if (GRAPHICS_VER(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

239 240
cancel_rq:
	if (err) {
241
		i915_request_set_error_once(rq, err);
242 243
		i915_request_add(rq);
	}
244
unpin_hws:
245 246 247
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
248
	i915_vm_put(vm);
249
	return err ? ERR_PTR(err) : rq;
250 251
}

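/* Read back the seqno the spinning batch wrote to the HWS for this request. */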
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
260
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

268 269
	kernel_context_close(h->ctx);

270
	igt_flush_test(h->gt->i915);
271 272
}

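/*
 * The batch is considered running once its seqno appears in the HWS:
 * busy-wait up to 10us for the common case, then sleep-wait up to 1s
 * before giving up.
 */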
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

283 284
static int igt_hang_sanitycheck(void *arg)
{
285
	struct intel_gt *gt = arg;
286
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

294
	err = hang_init(&h, gt);
295
	if (err)
296
		return err;
297

298
	for_each_engine(engine, gt, id) {
299
		struct intel_wedge_me w;
300 301
		long timeout;

302 303 304
		if (!intel_engine_can_store_dword(engine))
			continue;

305
		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

313
		i915_request_get(rq);
314 315

		*h.batch = MI_BATCH_BUFFER_END;
316
		intel_gt_chipset_flush(engine->gt);
317

318
		i915_request_add(rq);
319

320
		timeout = 0;
321
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
322
			timeout = i915_request_wait(rq, 0,
323
						    MAX_SCHEDULE_TIMEOUT);
324
		if (intel_gt_is_wedged(gt))
325 326
			timeout = -EIO;

327
		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

347 348
static int igt_reset_nop(void *arg)
{
349 350
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

359
	reset_count = i915_reset_count(global);
360 361
	count = 0;
	do {
362
		for_each_engine(engine, gt, id) {
363
			struct intel_context *ce;
364 365
			int i;

366 367 368
			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
369
				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
370 371 372
				break;
			}

373 374 375
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

376
				rq = intel_context_create_request(ce);
377 378
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
379 380
					pr_err("[%s] Create request failed: %d!\n",
					       engine->name, err);
					break;
				}

				i915_request_add(rq);
			}
386 387

			intel_context_put(ce);
388 389
		}

390 391 392
		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);
393

394
		if (intel_gt_is_wedged(gt)) {
395
			pr_err("[%s] GT is wedged!\n", engine->name);
396 397 398 399
			err = -EIO;
			break;
		}

400
		if (i915_reset_count(global) != reset_count + ++count) {
401 402
			pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
			       engine->name, i915_reset_count(global), reset_count, count);
403 404 405 406
			err = -EINVAL;
			break;
		}

407
		err = igt_flush_test(gt->i915);
408 409
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
410
			break;
411
		}
412 413 414
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

415 416
	if (igt_flush_test(gt->i915)) {
		pr_err("Post flush failed: %d!\n", err);
417
		err = -EIO;
418 419
	}

	return err;
}

static int igt_reset_nop_engine(void *arg)
{
425 426
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

432
	if (!intel_has_reset_engine(gt))
433 434
		return 0;

435
	for_each_engine(engine, gt, id) {
436 437
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
438
		IGT_TIMEOUT(end_time);
439 440
		int err;

		if (intel_engine_uses_guc(engine)) {
			/* Engine level resets are triggered by GuC when a hang
			 * is detected. They can't be triggered by the KMD any
			 * more. Thus a nop batch cannot be used as a reset test
			 */
			continue;
		}

449
		ce = intel_context_create(engine);
450
		if (IS_ERR(ce)) {
451
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
452
			return PTR_ERR(ce);
453
		}
454

455 456
		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
457 458
		count = 0;

459
		st_engine_heartbeat_disable(engine);
460 461
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

475
				rq = intel_context_create_request(ce);
476
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
497
			err = intel_engine_reset(engine, NULL);
498
			if (err) {
499 500
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
501 502 503
				break;
			}

504
			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

510
			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
518
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
519
		st_engine_heartbeat_enable(engine);
520

521
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
522

523 524 525
		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
526
		if (err)
527
			return err;
528 529
	}

530
	return 0;
531 532
}

static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		/* Can't manually break the reset if i915 doesn't perform it */
		if (intel_engine_uses_guc(engine))
			continue;

565
		ce = intel_context_create(engine);
566
		if (IS_ERR(ce)) {
567
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
568
			return PTR_ERR(ce);
569
		}
570 571

		st_engine_heartbeat_disable(engine);
572 573
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
670
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

683
static int __igt_reset_engine(struct intel_gt *gt, bool active)
684
{
685
	struct i915_gpu_error *global = &gt->i915->gpu_error;
686 687
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
688
	struct hang h;
689 690
	int err = 0;

691
	/* Check that we can issue an engine reset on an idle engine (no-op) */
692

693
	if (!intel_has_reset_engine(gt))
694 695
		return 0;

696
	if (active) {
697
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

702
	for_each_engine(engine, gt, id) {
703
		unsigned int reset_count, reset_engine_count;
704
		unsigned long count;
705
		bool using_guc = intel_engine_uses_guc(engine);
706 707
		IGT_TIMEOUT(end_time);

708 709 710
		if (using_guc && !active)
			continue;

711 712 713
		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

721 722
		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
723

724
		st_engine_heartbeat_disable(engine);
725 726
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
727
		count = 0;
728
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}
739

740
			if (active) {
741
				rq = hang_create_request(&h, engine);
742 743
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
744 745
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
746
					goto restore;
747 748
				}

749
				i915_request_get(rq);
750
				i915_request_add(rq);
751

752
				if (!wait_until_running(&h, rq)) {
753
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
754

755
					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

760
					i915_request_put(rq);
761
					err = -EIO;
762
					goto restore;
763
				}
764
			}
765

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("intel_engine_reset(%s) failed, err:%d\n",
					       engine->name, err);
					goto skip;
				}
773 774
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
781
					       rq->fence.seqno, rq->context->guc_id.id, err);
782 783
			}

skip:
			if (rq)
				i915_request_put(rq);

788
			if (i915_reset_count(global) != reset_count) {
789 790
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
791
				goto restore;
792 793
			}

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (i915_reset_engine_count(global, engine) !=
				    ++reset_engine_count) {
					pr_err("%s engine reset not recorded!\n",
					       engine->name);
					err = -EINVAL;
					goto restore;
				}
803
			}
804 805

			count++;

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err);
			if (err == 0)
				err = err2;
			if (err)
				break;
815
		} while (time_before(jiffies, end_time));
816
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
817
		st_engine_heartbeat_enable(engine);
818 819
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");
820

821
		if (err)
822 823
			break;

824
		err = igt_flush_test(gt->i915);
825 826
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
827
			break;
828
		}
829 830
	}

831 832
	if (intel_gt_is_wedged(gt)) {
		pr_err("GT is wedged!\n");
833
		err = -EIO;
834
	}
835

836
	if (active)
837 838
		hang_fini(&h);

839 840 841
	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

852
struct active_engine {
853 854
	struct kthread_worker *worker;
	struct kthread_work work;
855 856 857
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
858 859
	bool stop;
	int result;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

874
	if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
875
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
876 877
			  rq->engine->name,
			  rq->fence.context,
878
			  rq->fence.seqno);
879 880
		GEM_TRACE_DUMP();

881
		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

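/*
 * Background load for the reset tests: run from a kthread worker, keep
 * the engine busy by cycling new requests through a small ring of
 * contexts (optionally at random priority) until asked to stop.
 */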
static void active_engine(struct kthread_work *work)
891
{
892
	I915_RND_STATE(prng);
893
	struct active_engine *arg = container_of(work, typeof(*arg), work);
894 895
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
896 897
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
898 899
	int err = 0;

900 901 902
	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
903 904 905
			arg->result = PTR_ERR(ce[count]);
			pr_err("[%s] Create context #%ld failed: %d!\n",
			       engine->name, count, arg->result);
906
			while (--count)
907
				intel_context_put(ce[count]);
908
			return;
909
		}
910 911
	}

912
	count = 0;
913
	while (!READ_ONCE(arg->stop)) {
914
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
915 916
		struct i915_request *old = rq[idx];
		struct i915_request *new;
917

918
		new = intel_context_create_request(ce[idx]);
919 920
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
921
			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
922 923 924
			break;
		}

925 926
		rq[idx] = i915_request_get(new);
		i915_request_add(new);
927

928
		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
933
			engine->sched_engine->schedule(rq[idx], &attr);
934 935
		}

936
		err = active_request_put(old);
937 938
		if (err) {
			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
939
			break;
940
		}
941 942

		cond_resched();
943 944
	}

945 946 947
	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

948 949 950
		if (err)
			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err);

951 952 953
		/* Keep the first error */
		if (!err)
			err = err__;
954 955

		intel_context_put(ce[count]);
956
	}
957

958
	arg->result = err;
959 960
}

961
static int __igt_reset_engines(struct intel_gt *gt,
962 963
			       const char *test_name,
			       unsigned int flags)
964
{
965
	struct i915_gpu_error *global = &gt->i915->gpu_error;
966
	struct intel_engine_cs *engine, *other;
967
	struct active_engine *threads;
968
	enum intel_engine_id id, tmp;
969
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

976
	if (!intel_has_reset_engine(gt))
977 978
		return 0;

979
	if (flags & TEST_ACTIVE) {
980
		err = hang_init(&h, gt);
981 982
		if (err)
			return err;
983 984

		if (flags & TEST_PRIORITY)
985
			h.ctx->sched.priority = 1024;
986 987
	}

	threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL);
	if (!threads)
		return -ENOMEM;

992
	for_each_engine(engine, gt, id) {
993
		unsigned long device = i915_reset_count(global);
994
		unsigned long count = 0, reported;
995
		bool using_guc = intel_engine_uses_guc(engine);
996 997
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE) {
			if (!intel_engine_can_store_dword(engine))
				continue;
		} else if (using_guc)
1002 1003
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

1011
		memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES);
1012
		for_each_engine(other, gt, tmp) {
1013
			struct kthread_worker *worker;
1014

1015
			threads[tmp].resets =
1016
				i915_reset_engine_count(global, other);
1017

1018
			if (other == engine && !(flags & TEST_SELF))
1019 1020
				continue;

1021
			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			worker = kthread_create_worker(0, "igt/%s",
						       other->name);
			if (IS_ERR(worker)) {
				err = PTR_ERR(worker);
				pr_err("[%s] Worker create failed: %d!\n",
				       engine->name, err);
1033 1034 1035
				goto unwind;
			}

1036
			threads[tmp].worker = worker;
1037

			kthread_init_work(&threads[tmp].work, active_engine);
			kthread_queue_work(threads[tmp].worker,
					   &threads[tmp].work);
		}
1042

1043
		st_engine_heartbeat_disable_no_pm(engine);
1044 1045
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
1046
		do {
1047
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}
1057

1058
			if (flags & TEST_ACTIVE) {
1059
				rq = hang_create_request(&h, engine);
1060 1061
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
1062 1063
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
1064
					goto restore;
1065 1066
				}

1067
				i915_request_get(rq);
1068
				i915_request_add(rq);
1069

1070
				if (!wait_until_running(&h, rq)) {
1071
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1072

1073
					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

1078
					i915_request_put(rq);
1079
					err = -EIO;
1080
					goto restore;
1081
				}
1082 1083
			} else {
				intel_engine_pm_get(engine);
1084 1085
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
					       engine->name, test_name, err);
					goto restore;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
1101
					       rq->fence.seqno, rq->context->guc_id.id, err);
1102
			}
1103 1104

			count++;
1105 1106

			if (rq) {
1107
				if (rq->fence.error != -EIO) {
1108
					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
1109 1110
					       engine->name, test_name,
					       rq->fence.context,
1111
					       rq->fence.seqno, rq->context->guc_id.id);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
1117
					goto restore;
1118 1119
				}

1120 1121
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
1122
						drm_info_printer(gt->i915->drm.dev);
1123 1124

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
1134
					intel_gt_set_wedged(gt);
1135
					err = -EIO;
1136
					goto restore;
1137 1138
				}

1139 1140
				i915_request_put(rq);
			}
1141

1142 1143 1144
			if (!(flags & TEST_ACTIVE))
				intel_engine_pm_put(engine);

1145 1146
			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
1147
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
1156
				goto restore;
1157
			}

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
1167
		} while (time_before(jiffies, end_time));
1168
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
1169
		st_engine_heartbeat_enable_no_pm(engine);
1170

1171
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
1172 1173
			engine->name, test_name, count);

		/* GuC based resets are not logged per engine */
		if (!using_guc) {
			reported = i915_reset_engine_count(global, engine);
			reported -= threads[engine->id].resets;
			if (reported != count) {
				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
				       engine->name, test_name, count, reported);
				if (!err)
					err = -EINVAL;
			}
1184
		}
1185 1186

unwind:
1187
		for_each_engine(other, gt, tmp) {
1188 1189
			int ret;

1190
			if (!threads[tmp].worker)
1191 1192
				continue;

1193 1194 1195
			WRITE_ONCE(threads[tmp].stop, true);
			kthread_flush_work(&threads[tmp].work);
			ret = READ_ONCE(threads[tmp].result);
1196
			if (ret) {
1197 1198
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
1199 1200 1201
				if (!err)
					err = ret;
			}
1202 1203

			kthread_destroy_worker(threads[tmp].worker);
1204

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (other->uabi_class != engine->uabi_class &&
				    threads[tmp].resets !=
				    i915_reset_engine_count(global, other)) {
					pr_err("Innocent engine %s was reset (count=%ld)\n",
					       other->name,
					       i915_reset_engine_count(global, other) -
					       threads[tmp].resets);
					if (!err)
						err = -EINVAL;
				}
1217 1218 1219
			}
		}

1220
		if (device != i915_reset_count(global)) {
1221
			pr_err("Global reset (count=%ld)!\n",
1222
			       i915_reset_count(global) - device);
1223 1224
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

1230
		err = igt_flush_test(gt->i915);
1231 1232
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1233
			break;
1234
		}
1235
	}
1236
	kfree(threads);
1237

1238
	if (intel_gt_is_wedged(gt))
1239 1240
		err = -EIO;

1241
	if (flags & TEST_ACTIVE)
1242 1243
		hang_fini(&h);

1244 1245 1246
	return err;
}

1247
static int igt_reset_engines(void *arg)
1248
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
1263
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1264 1265 1266
		},
		{ }
	};
1267
	struct intel_gt *gt = arg;
1268 1269
	typeof(*phases) *p;
	int err;
1270

1271 1272
	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
1273
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
1283 1284
}

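/*
 * Trigger a reset directly, as if hangcheck had fired, and return the
 * global reset count beforehand so callers can check that it advanced.
 */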
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1286
{
1287
	u32 count = i915_reset_count(&gt->i915->gpu_error);
1288

1289
	intel_gt_reset(gt, mask, NULL);
1290

1291
	return count;
1292 1293
}

1294
static int igt_reset_wait(void *arg)
1295
{
1296 1297
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
1298
	struct intel_engine_cs *engine;
1299
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

1305 1306
	engine = intel_selftest_find_any_engine(gt);

1307
	if (!engine || !intel_engine_can_store_dword(engine))
1308 1309
		return 0;

1310 1311
	/* Check that we detect a stuck waiter and issue a reset */

1312
	igt_global_reset_lock(gt);
1313

1314
	err = hang_init(&h, gt);
1315 1316
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1317
		goto unlock;
1318
	}
1319

1320
	rq = hang_create_request(&h, engine);
1321 1322
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
1323
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1324 1325 1326
		goto fini;
	}

1327
	i915_request_get(rq);
1328
	i915_request_add(rq);
1329

1330
	if (!wait_until_running(&h, rq)) {
1331
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1332

1333
		pr_err("%s: Failed to start request %llx, at %x\n",
1334
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1335
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1336

1337
		intel_gt_set_wedged(gt);
1338

		err = -EIO;
		goto out_rq;
	}

1343
	reset_count = fake_hangcheck(gt, ALL_ENGINES);
1344

1345
	timeout = i915_request_wait(rq, 0, 10);
1346
	if (timeout < 0) {
1347
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

1353
	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
1360
	i915_request_put(rq);
1361 1362 1363
fini:
	hang_fini(&h);
unlock:
1364
	igt_global_reset_unlock(gt);
1365

1366
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

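/*
 * evict_vma()/evict_fence() run from a kthread and attempt to evict
 * (or fence) the target vma while it is still busy on a hanging
 * request; the completion signals that the worker has started.
 */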
struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

1386
	mutex_lock(&vm->mutex);
1387
	err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
1388
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
1404
		return err;
1405 1406
	}

1407 1408 1409
	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1410
		return err;
1411 1412
	}

1413
	err = i915_vma_pin_fence(arg->vma);
1414
	i915_vma_unpin(arg->vma);
1415 1416
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1417
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

1422
	return 0;
1423 1424
}

1425
static int __igt_reset_evict_vma(struct intel_gt *gt,
1426 1427 1428
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
1429
{
1430
	struct intel_engine_cs *engine;
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
1436
	unsigned int pin_flags;
1437 1438
	int err;

1439 1440 1441
	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

1442 1443
	engine = intel_selftest_find_any_engine(gt);

1444
	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

1449
	err = hang_init(&h, gt);
1450 1451
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1452
		return err;
1453
	}
1454

1455
	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1456 1457
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
1458
		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
1459 1460 1461
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

1470 1471 1472
	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
1473
		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
1474 1475 1476
		goto out_obj;
	}

1477
	rq = hang_create_request(&h, engine);
1478 1479
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
1480
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1481 1482 1483
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1490 1491
	if (err) {
		i915_request_add(rq);
1492
		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
1493
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}
1505

1506
	err = igt_vma_move_to_active_unlocked(arg.vma, rq, flags);
1507 1508
	if (err)
		pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
1509

1510 1511
	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
1520
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1521

1522
		pr_err("%s: Failed to start request %llx, at %x\n",
1523 1524 1525
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

1526
		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

1532
	tsk = kthread_run(fn, &arg, "igt/evict_vma");
1533 1534
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
1535
		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1536 1537 1538
		tsk = NULL;
		goto out_reset;
	}
1539
	get_task_struct(tsk);
1540 1541 1542

	wait_for_completion(&arg.completion);

1543
	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1544
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

1549
		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
1554 1555 1556
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);
1557 1558

	if (tsk) {
1559
		struct intel_wedge_me w;
1560 1561

		/* The reset, even indirectly, should take less than 10ms. */
1562
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1563
			err = kthread_stop(tsk);
1564 1565

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
1574
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
1582
	struct intel_gt *gt = arg;
1583

1584
	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1585
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
1590
	struct intel_gt *gt = arg;
1591
	struct i915_ppgtt *ppgtt;
1592 1593
	int err;

1594 1595 1596
	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;
1597

1598
	ppgtt = i915_ppgtt_create(gt, 0);
1599 1600
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);
1601

1602 1603 1604
	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

1609 1610
static int igt_reset_evict_fence(void *arg)
{
1611
	struct intel_gt *gt = arg;
1612

1613
	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1614 1615 1616
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

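/*
 * Wait for every engine other than @exclude to idle; used before
 * triggering a device reset so that only the engine under test has
 * work in flight.
 */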
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

1623
	for_each_engine(engine, gt, id) {
1624 1625 1626
		if (engine == exclude)
			continue;

1627
		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

1634 1635
static int igt_reset_queue(void *arg)
{
1636 1637
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

1645
	igt_global_reset_lock(gt);
1646

1647
	err = hang_init(&h, gt);
1648 1649 1650
	if (err)
		goto unlock;

1651
	for_each_engine(engine, gt, id) {
1652
		struct intel_selftest_saved_policy saved;
1653
		struct i915_request *prev;
1654 1655
		IGT_TIMEOUT(end_time);
		unsigned int count;
1656
		bool using_guc = intel_engine_uses_guc(engine);
1657

1658 1659 1660
		if (!intel_engine_can_store_dword(engine))
			continue;

		if (using_guc) {
			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				goto fini;
			}
		}

1670
		prev = hang_create_request(&h, engine);
1671 1672
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
1673
			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
1674
			goto restore;
1675 1676
		}

1677
		i915_request_get(prev);
1678
		i915_request_add(prev);
1679 1680 1681

		count = 0;
		do {
1682
			struct i915_request *rq;
1683 1684
			unsigned int reset_count;

1685
			rq = hang_create_request(&h, engine);
1686 1687
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
1688
				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1689
				goto restore;
1690 1691
			}

1692
			i915_request_get(rq);
1693
			i915_request_add(rq);
1694

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
1705
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
1713
				intel_gt_set_wedged(gt);
1714
				goto restore;
1715 1716
			}

1717
			if (!wait_until_running(&h, prev)) {
1718
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1719

1720
				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);
1725

1726 1727
				i915_request_put(rq);
				i915_request_put(prev);
1728

1729
				intel_gt_set_wedged(gt);
1730

1731
				err = -EIO;
1732
				goto restore;
1733 1734
			}

1735
			reset_count = fake_hangcheck(gt, BIT(id));
1736

1737 1738 1739
			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
1740 1741
				i915_request_put(rq);
				i915_request_put(prev);
1742
				err = -EINVAL;
1743
				goto restore;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
1749 1750
				i915_request_put(rq);
				i915_request_put(prev);
1751
				err = -EINVAL;
1752
				goto restore;
1753 1754
			}

1755
			if (i915_reset_count(global) == reset_count) {
1756
				pr_err("No GPU reset recorded!\n");
1757 1758
				i915_request_put(rq);
				i915_request_put(prev);
1759
				err = -EINVAL;
1760
				goto restore;
1761 1762
			}

1763
			i915_request_put(prev);
1764 1765 1766
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
1767 1768
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);
1769 1770

		*h.batch = MI_BATCH_BUFFER_END;
1771
		intel_gt_chipset_flush(engine->gt);
1772

1773
		i915_request_put(prev);
1774

restore:
		if (using_guc) {
			int err2 = intel_selftest_restore_policy(engine, &saved);

			if (err2)
				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
				       __func__, __LINE__, engine->name, err2);
			if (err == 0)
				err = err2;
		}
		if (err)
			goto fini;

1788
		err = igt_flush_test(gt->i915);
1789 1790
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1791
			break;
1792
		}
	}

fini:
	hang_fini(&h);
unlock:
1798
	igt_global_reset_unlock(gt);
1799

1800
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

1806
static int igt_handle_error(void *arg)
1807
{
1808 1809
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
1810
	struct intel_engine_cs *engine;
1811
	struct hang h;
1812
	struct i915_request *rq;
1813
	struct i915_gpu_coredump *error;
1814
	int err;
1815

1816 1817
	engine = intel_selftest_find_any_engine(gt);

1818 1819
	/* Check that we can issue a global GPU and engine reset */

1820
	if (!intel_has_reset_engine(gt))
1821 1822
		return 0;

1823
	if (!engine || !intel_engine_can_store_dword(engine))
1824 1825
		return 0;

1826
	err = hang_init(&h, gt);
1827 1828
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1829
		return err;
1830
	}
1831

1832
	rq = hang_create_request(&h, engine);
1833 1834
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
1835
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1836
		goto err_fini;
1837 1838
	}

1839
	i915_request_get(rq);
1840
	i915_request_add(rq);
1841

1842
	if (!wait_until_running(&h, rq)) {
1843
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1844

1845
		pr_err("%s: Failed to start request %llx, at %x\n",
1846
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1847
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1848

1849
		intel_gt_set_wedged(gt);
1850

1851
		err = -EIO;
1852
		goto err_request;
1853 1854
	}

1855
	/* Temporarily disable error capture */
1856
	error = xchg(&global->first_error, (void *)-1);
1857

1858
	intel_gt_handle_error(gt, engine->mask, 0, NULL);
1859

1860
	xchg(&global->first_error, error);
1861

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}
1867 1868

err_request:
1869
	i915_request_put(rq);
1870 1871
err_fini:
	hang_fini(&h);
1872
	return err;
1873 1874
}

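/*
 * Perform an engine reset from within the given atomic context: the
 * submission tasklet is disabled and, unless the phase itself is
 * softirq, bottom halves are disabled around the reset.
 */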
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1876
				     const struct igt_atomic_section *p,
1877 1878
				     const char *mode)
{
1879
	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

1885 1886
	if (t->func)
		tasklet_disable(t);
1887 1888
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
1889 1890
	p->critical_section_begin();

1891
	err = __intel_engine_reset_bh(engine, NULL);
1892 1893

	p->critical_section_end();
1894 1895
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1909
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

1919
	err = hang_init(&h, engine->gt);
1920 1921
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1922
		return err;
1923
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
1928
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
1941
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
1946
		struct intel_wedge_me w;
1947

1948
		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1949
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1950
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

1960
static int igt_reset_engines_atomic(void *arg)
1961
{
1962
	struct intel_gt *gt = arg;
1963
	const typeof(*igt_atomic_phases) *p;
1964 1965
	int err = 0;

1966 1967
	/* Check that engine resets are usable from atomic context */

1968
	if (!intel_has_reset_engine(gt))
1969 1970
		return 0;

1971
	if (intel_uc_uses_guc_submission(&gt->uc))
1972
		return 0;
1973

1974
	igt_global_reset_lock(gt);
1975 1976

	/* Flush any requests before we get started and check basics */
1977
	if (!igt_force_reset(gt))
1978 1979
		goto unlock;

1980
	for (p = igt_atomic_phases; p->name; p++) {
1981 1982 1983
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

1984
		for_each_engine(engine, gt, id) {
1985 1986 1987
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
1993
	igt_force_reset(gt);
1994
unlock:
1995
	igt_global_reset_unlock(gt);

	return err;
}

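/*
 * Entry point: run the hangcheck subtests above while holding a
 * runtime-pm wakeref, skipping devices without reset support and
 * bailing out early if the GT is already wedged.
 */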
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
2004 2005
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
2006 2007
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
2008
		SUBTEST(igt_reset_fail_engine),
2009
		SUBTEST(igt_reset_engines),
2010
		SUBTEST(igt_reset_engines_atomic),
2011
		SUBTEST(igt_reset_queue),
2012 2013 2014
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
2015
		SUBTEST(igt_reset_evict_fence),
2016
		SUBTEST(igt_handle_error),
2017
	};
2018
	struct intel_gt *gt = to_gt(i915);
2019
	intel_wakeref_t wakeref;
2020
	int err;
2021

2022
	if (!intel_has_gpu_reset(gt))
2023 2024
		return 0;

2025
	if (intel_gt_is_wedged(gt))
2026 2027
		return -EIO; /* we're long past hope of a successful reset */

2028
	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
2029

2030
	err = intel_gt_live_subtests(tests, gt);
2031

2032
	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
2033 2034

	return err;
2035
}