// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched/task.h>
#include <linux/dynamic_debug.h>
#include <drm/ttm/ttm_tt.h>
#include <drm/drm_exec.h>

#include "amdgpu_sync.h"
#include "amdgpu_object.h"
#include "amdgpu_vm.h"
#include "amdgpu_hmm.h"
#include "amdgpu.h"
#include "amdgpu_xgmi.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"
#include "kfd_smi_events.h"

#ifdef dev_fmt
#undef dev_fmt
#endif
#define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__

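/* Delay, in milliseconds, before the svm range restore worker runs (see
 * svm_range_restore_work()).
 */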
#define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1

/* Long enough to ensure no retry fault comes after svm range is restored and
 * page table is updated.
 */
#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING	(2UL * NSEC_PER_MSEC)
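/* Gate svm_range_debug_dump() behind the "svm_range_dump" dynamic debug call
 * site when CONFIG_DYNAMIC_DEBUG is enabled; without it the dump compiles
 * away entirely.
 */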
#if IS_ENABLED(CONFIG_DYNAMIC_DEBUG)
#define dynamic_svm_range_dump(svms) \
	_dynamic_func_call_no_desc("svm_range_dump", svm_range_debug_dump, svms)
#else
#define dynamic_svm_range_dump(svms) \
	do { if (0) svm_range_debug_dump(svms); } while (0)
#endif

/* A giant svm range is split into smaller ranges based on this value, which is
 * chosen as the minimum of 1/32 of the VRAM size over all dGPUs/APUs, clamped
 * between 2MB and 1GB and aligned to a power of two in units of 2MB.
 */
static uint64_t max_svm_range_pages;

struct criu_svm_metadata {
	struct list_head list;
	struct kfd_criu_svm_range_priv_data data;
};

static void svm_range_evict_svm_bo_worker(struct work_struct *work);
static bool
svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
				    const struct mmu_notifier_range *range,
				    unsigned long cur_seq);
static int
svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
		   uint64_t *bo_s, uint64_t *bo_l);
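/* Invoked by the MMU notifier core when CPU page tables covering a range
 * change; see svm_range_cpu_invalidate_pagetables().
 */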
static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
	.invalidate = svm_range_cpu_invalidate_pagetables,
};

/**
 * svm_range_unlink - unlink svm_range from lists and interval tree
 * @prange: svm range structure to be removed
 *
 * Remove the svm_range from the svms and svm_bo lists and the svms
 * interval tree.
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_unlink(struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	if (prange->svm_bo) {
		spin_lock(&prange->svm_bo->list_lock);
		list_del(&prange->svm_bo_list);
		spin_unlock(&prange->svm_bo->list_lock);
	}

	list_del(&prange->list);
	if (prange->it_node.start != 0 && prange->it_node.last != 0)
		interval_tree_remove(&prange->it_node, &prange->svms->objects);
}

static void
svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	mmu_interval_notifier_insert_locked(&prange->notifier, mm,
				     prange->start << PAGE_SHIFT,
				     prange->npages << PAGE_SHIFT,
				     &svm_range_mn_ops);
}

/**
 * svm_range_add_to_svms - add svm range to svms
 * @prange: svm range structure to be added
 *
 * Add the svm range to svms interval tree and link list
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_add_to_svms(struct svm_range *prange)
{
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
		 prange, prange->start, prange->last);

	list_move_tail(&prange->list, &prange->svms->list);
	prange->it_node.start = prange->start;
	prange->it_node.last = prange->last;
	interval_tree_insert(&prange->it_node, &prange->svms->objects);
}

static void svm_range_remove_notifier(struct svm_range *prange)
{
	pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n",
		 prange->svms, prange,
		 prange->notifier.interval_tree.start >> PAGE_SHIFT,
		 prange->notifier.interval_tree.last >> PAGE_SHIFT);

	if (prange->notifier.interval_tree.start != 0 &&
	    prange->notifier.interval_tree.last != 0)
		mmu_interval_notifier_remove(&prange->notifier);
}

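/* Addresses tagged with SVM_RANGE_VRAM_DOMAIN are VRAM addresses produced by
 * svm_range_dma_map_dev() rather than DMA mappings, so they are not treated
 * as valid DMA mapping addresses here.
 */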
static bool
svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr)
{
	return dma_addr && !dma_mapping_error(dev, dma_addr) &&
	       !(dma_addr & SVM_RANGE_VRAM_DOMAIN);
}

static int
svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
		      unsigned long offset, unsigned long npages,
		      unsigned long *hmm_pfns, uint32_t gpuidx)
{
	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
	dma_addr_t *addr = prange->dma_addr[gpuidx];
	struct device *dev = adev->dev;
	struct page *page;
	int i, r;

	if (!addr) {
		addr = kvcalloc(prange->npages, sizeof(*addr), GFP_KERNEL);
		if (!addr)
			return -ENOMEM;
		prange->dma_addr[gpuidx] = addr;
	}

	addr += offset;
	for (i = 0; i < npages; i++) {
		if (svm_is_valid_dma_mapping_addr(dev, addr[i]))
			dma_unmap_page(dev, addr[i], PAGE_SIZE, dir);

		page = hmm_pfn_to_page(hmm_pfns[i]);
		if (is_zone_device_page(page)) {
			struct amdgpu_device *bo_adev = prange->svm_bo->node->adev;

			addr[i] = (hmm_pfns[i] << PAGE_SHIFT) +
				   bo_adev->vm_manager.vram_base_offset -
				   bo_adev->kfd.pgmap.range.start;
			addr[i] |= SVM_RANGE_VRAM_DOMAIN;
			pr_debug_ratelimited("vram address: 0x%llx\n", addr[i]);
			continue;
		}
		addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
		r = dma_mapping_error(dev, addr[i]);
		if (r) {
			dev_err(dev, "failed %d dma_map_page\n", r);
			return r;
		}
		pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n",
				     addr[i] >> PAGE_SHIFT, page_to_pfn(page));
	}

	return 0;
}

static int
svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap,
		  unsigned long offset, unsigned long npages,
		  unsigned long *hmm_pfns)
{
	struct kfd_process *p;
	uint32_t gpuidx;
	int r;

	p = container_of(prange->svms, struct kfd_process, svms);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		struct kfd_process_device *pdd;

		pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}

		r = svm_range_dma_map_dev(pdd->dev->adev, prange, offset, npages,
					  hmm_pfns, gpuidx);
		if (r)
			break;
	}

	return r;
}

void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr,
			 unsigned long offset, unsigned long npages)
{
	enum dma_data_direction dir = DMA_BIDIRECTIONAL;
	int i;

	if (!dma_addr)
		return;

	for (i = offset; i < offset + npages; i++) {
		if (!svm_is_valid_dma_mapping_addr(dev, dma_addr[i]))
			continue;
		pr_debug_ratelimited("unmap 0x%llx\n", dma_addr[i] >> PAGE_SHIFT);
		dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
		dma_addr[i] = 0;
	}
}

void svm_range_dma_unmap(struct svm_range *prange)
{
	struct kfd_process_device *pdd;
	dma_addr_t *dma_addr;
	struct device *dev;
	struct kfd_process *p;
	uint32_t gpuidx;

	p = container_of(prange->svms, struct kfd_process, svms);

	for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
		dma_addr = prange->dma_addr[gpuidx];
		if (!dma_addr)
			continue;

		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			continue;
		}
		dev = &pdd->dev->adev->pdev->dev;

		svm_range_dma_unmap_dev(dev, dma_addr, 0, prange->npages);
	}
}

static void svm_range_free(struct svm_range *prange, bool do_unmap)
{
	uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT;
	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
	uint32_t gpuidx;

	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
		 prange->start, prange->last);

	svm_range_vram_node_free(prange);
	if (do_unmap)
		svm_range_dma_unmap(prange);

	if (do_unmap && !p->xnack_enabled) {
		pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size);
		amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
	}

	/* free dma_addr array for each gpu */
	for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) {
		if (prange->dma_addr[gpuidx]) {
			kvfree(prange->dma_addr[gpuidx]);
			prange->dma_addr[gpuidx] = NULL;
		}
	}

	mutex_destroy(&prange->lock);
	mutex_destroy(&prange->migrate_mutex);
	kfree(prange);
}

static void
svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc,
				 uint8_t *granularity, uint32_t *flags)
{
	*location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
	*prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
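	/* 2^9 = 512 pages, i.e. a 2MB granularity with 4KB pages */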
	*granularity = 9;
	*flags =
		KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT;
}

static struct
svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
			 uint64_t last, bool update_mem_usage)
{
	uint64_t size = last - start + 1;
	struct svm_range *prange;
	struct kfd_process *p;

	prange = kzalloc(sizeof(*prange), GFP_KERNEL);
	if (!prange)
		return NULL;

	p = container_of(svms, struct kfd_process, svms);
	if (!p->xnack_enabled && update_mem_usage &&
	    amdgpu_amdkfd_reserve_mem_limit(NULL, size << PAGE_SHIFT,
				    KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0)) {
		pr_info("SVM mapping failed, exceeds resident system memory limit\n");
		kfree(prange);
		return NULL;
	}
	prange->npages = size;
	prange->svms = svms;
	prange->start = start;
	prange->last = last;
	INIT_LIST_HEAD(&prange->list);
	INIT_LIST_HEAD(&prange->update_list);
	INIT_LIST_HEAD(&prange->svm_bo_list);
	INIT_LIST_HEAD(&prange->deferred_list);
	INIT_LIST_HEAD(&prange->child_list);
	atomic_set(&prange->invalid, 0);
	prange->validate_timestamp = 0;
	prange->vram_pages = 0;
	mutex_init(&prange->migrate_mutex);
	mutex_init(&prange->lock);

	if (p->xnack_enabled)
		bitmap_copy(prange->bitmap_access, svms->bitmap_supported,
			    MAX_GPU_INSTANCE);

	svm_range_set_default_attributes(&prange->preferred_loc,
					 &prange->prefetch_loc,
					 &prange->granularity, &prange->flags);

	pr_debug("svms 0x%p [0x%llx 0x%llx]\n", svms, start, last);

	return prange;
}

static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
{
	if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
		return false;

	return true;
}

static void svm_range_bo_release(struct kref *kref)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(kref, struct svm_range_bo, kref);
	pr_debug("svm_bo 0x%p\n", svm_bo);

	spin_lock(&svm_bo->list_lock);
	while (!list_empty(&svm_bo->range_list)) {
		struct svm_range *prange =
				list_first_entry(&svm_bo->range_list,
						struct svm_range, svm_bo_list);
		/* list_del_init tells a concurrent svm_range_vram_node_new when
		 * it's safe to reuse the svm_bo pointer and svm_bo_list head.
		 */
		list_del_init(&prange->svm_bo_list);
		spin_unlock(&svm_bo->list_lock);

		pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
			 prange->start, prange->last);
		mutex_lock(&prange->lock);
		prange->svm_bo = NULL;
		/* prange should not hold vram page now */
		WARN_ONCE(prange->actual_loc, "prange should not hold vram page");
		mutex_unlock(&prange->lock);

		spin_lock(&svm_bo->list_lock);
	}
	spin_unlock(&svm_bo->list_lock);
	if (!dma_fence_is_signaled(&svm_bo->eviction_fence->base))
		/* We're not in the eviction worker. Signal the fence. */
		dma_fence_signal(&svm_bo->eviction_fence->base);
	dma_fence_put(&svm_bo->eviction_fence->base);
	amdgpu_bo_unref(&svm_bo->bo);
	kfree(svm_bo);
}

static void svm_range_bo_wq_release(struct work_struct *work)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(work, struct svm_range_bo, release_work);
	svm_range_bo_release(&svm_bo->kref);
}

static void svm_range_bo_release_async(struct kref *kref)
{
	struct svm_range_bo *svm_bo;

	svm_bo = container_of(kref, struct svm_range_bo, kref);
	pr_debug("svm_bo 0x%p\n", svm_bo);
	INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release);
	schedule_work(&svm_bo->release_work);
}

void svm_range_bo_unref_async(struct svm_range_bo *svm_bo)
{
	kref_put(&svm_bo->kref, svm_range_bo_release_async);
}

static void svm_range_bo_unref(struct svm_range_bo *svm_bo)
{
	if (svm_bo)
		kref_put(&svm_bo->kref, svm_range_bo_release);
}

static bool
svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange)
{
	mutex_lock(&prange->lock);
	if (!prange->svm_bo) {
		mutex_unlock(&prange->lock);
		return false;
	}
	if (prange->ttm_res) {
		/* We still have a reference, all is well */
		mutex_unlock(&prange->lock);
		return true;
	}
	if (svm_bo_ref_unless_zero(prange->svm_bo)) {
		/*
		 * Migrate from GPU to GPU, remove range from source svm_bo->node
		 * range list, and return false to allocate svm_bo from destination
		 * node.
		 */
		if (prange->svm_bo->node != node) {
			mutex_unlock(&prange->lock);

			spin_lock(&prange->svm_bo->list_lock);
			list_del_init(&prange->svm_bo_list);
			spin_unlock(&prange->svm_bo->list_lock);

			svm_range_bo_unref(prange->svm_bo);
			return false;
		}
		if (READ_ONCE(prange->svm_bo->evicting)) {
			struct dma_fence *f;
			struct svm_range_bo *svm_bo;
			/* The BO is getting evicted,
			 * we need to get a new one
			 */
			mutex_unlock(&prange->lock);
			svm_bo = prange->svm_bo;
			f = dma_fence_get(&svm_bo->eviction_fence->base);
			svm_range_bo_unref(prange->svm_bo);
			/* wait for the fence to avoid long spin-loop
			 * at list_empty_careful
			 */
			dma_fence_wait(f, false);
			dma_fence_put(f);
		} else {
			/* The BO was still around and we got
			 * a new reference to it
			 */
			mutex_unlock(&prange->lock);
			pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n",
				 prange->svms, prange->start, prange->last);

			prange->ttm_res = prange->svm_bo->bo->tbo.resource;
			return true;
		}

	} else {
		mutex_unlock(&prange->lock);
	}

	/* We need a new svm_bo. Spin-loop to wait for concurrent
	 * svm_range_bo_release to finish removing this range from
	 * its range list and setting prange->svm_bo to NULL. After this,
	 * it is safe to reuse the svm_bo pointer and svm_bo_list head.
	 */
	while (!list_empty_careful(&prange->svm_bo_list) || prange->svm_bo)
		cond_resched();

	return false;
}

static struct svm_range_bo *svm_range_bo_new(void)
{
	struct svm_range_bo *svm_bo;

	svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
	if (!svm_bo)
		return NULL;

	kref_init(&svm_bo->kref);
	INIT_LIST_HEAD(&svm_bo->range_list);
	spin_lock_init(&svm_bo->list_lock);

	return svm_bo;
}

int
svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
			bool clear)
{
	struct amdgpu_bo_param bp;
	struct svm_range_bo *svm_bo;
	struct amdgpu_bo_user *ubo;
	struct amdgpu_bo *bo;
	struct kfd_process *p;
	struct mm_struct *mm;
	int r;

	p = container_of(prange->svms, struct kfd_process, svms);
	pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, prange->svms,
		 prange->start, prange->last);

	if (svm_range_validate_svm_bo(node, prange))
		return 0;

	svm_bo = svm_range_bo_new();
	if (!svm_bo) {
		pr_debug("failed to alloc svm bo\n");
		return -ENOMEM;
	}
	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_debug("failed to get mm\n");
		kfree(svm_bo);
		return -ESRCH;
	}
	svm_bo->node = node;
	svm_bo->eviction_fence =
		amdgpu_amdkfd_fence_create(dma_fence_context_alloc(1),
					   mm,
					   svm_bo);
	mmput(mm);
	INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker);
	svm_bo->evicting = 0;
	memset(&bp, 0, sizeof(bp));
	bp.size = prange->npages * PAGE_SIZE;
	bp.byte_align = PAGE_SIZE;
	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
	bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
	bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
	bp.flags |= AMDGPU_GEM_CREATE_DISCARDABLE;
	bp.type = ttm_bo_type_device;
	bp.resv = NULL;
	if (node->xcp)
		bp.xcp_id_plus1 = node->xcp->id + 1;

	r = amdgpu_bo_create_user(node->adev, &bp, &ubo);
	if (r) {
		pr_debug("failed %d to create bo\n", r);
		goto create_bo_failed;
	}
	bo = &ubo->bo;

	pr_debug("alloc bo at offset 0x%lx size 0x%lx on partition %d\n",
		 bo->tbo.resource->start << PAGE_SHIFT, bp.size,
		 bp.xcp_id_plus1 - 1);

	r = amdgpu_bo_reserve(bo, true);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		goto reserve_bo_failed;
	}

	if (clear) {
		r = amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);
		if (r) {
			pr_debug("failed %d to sync bo\n", r);
			amdgpu_bo_unreserve(bo);
			goto reserve_bo_failed;
		}
	}

	r = dma_resv_reserve_fences(bo->tbo.base.resv, 1);
	if (r) {
		pr_debug("failed %d to reserve bo\n", r);
		amdgpu_bo_unreserve(bo);
		goto reserve_bo_failed;
	}
	amdgpu_bo_fence(bo, &svm_bo->eviction_fence->base, true);

	amdgpu_bo_unreserve(bo);

	svm_bo->bo = bo;
	prange->svm_bo = svm_bo;
	prange->ttm_res = bo->tbo.resource;
	prange->offset = 0;

	spin_lock(&svm_bo->list_lock);
	list_add(&prange->svm_bo_list, &svm_bo->range_list);
	spin_unlock(&svm_bo->list_lock);

	return 0;

reserve_bo_failed:
	amdgpu_bo_unref(&bo);
create_bo_failed:
	dma_fence_put(&svm_bo->eviction_fence->base);
	kfree(svm_bo);
	prange->ttm_res = NULL;

	return r;
}

void svm_range_vram_node_free(struct svm_range *prange)
{
	/* serialize prange->svm_bo unref */
	mutex_lock(&prange->lock);
	/* prange->svm_bo has not been unref */
	if (prange->ttm_res) {
		prange->ttm_res = NULL;
		mutex_unlock(&prange->lock);
		svm_range_bo_unref(prange->svm_bo);
	} else
		mutex_unlock(&prange->lock);
}

struct kfd_node *
svm_range_get_node_by_id(struct svm_range *prange, uint32_t gpu_id)
{
	struct kfd_process *p;
	struct kfd_process_device *pdd;

	p = container_of(prange->svms, struct kfd_process, svms);
	pdd = kfd_process_device_data_by_id(p, gpu_id);
	if (!pdd) {
		pr_debug("failed to get kfd process device by id 0x%x\n", gpu_id);
		return NULL;
	}

	return pdd->dev;
}

struct kfd_process_device *
svm_range_get_pdd_by_node(struct svm_range *prange, struct kfd_node *node)
{
	struct kfd_process *p;

	p = container_of(prange->svms, struct kfd_process, svms);

	return kfd_get_process_device_data(node, p);
}

static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
{
	struct ttm_operation_ctx ctx = { false, false };

	amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_VRAM);

	return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
}

static int
svm_range_check_attr(struct kfd_process *p,
		     uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
{
	uint32_t i;

	for (i = 0; i < nattr; i++) {
		uint32_t val = attrs[i].value;
		int gpuidx = MAX_GPU_INSTANCE;

		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM &&
			    val != KFD_IOCTL_SVM_LOCATION_UNDEFINED)
				gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM)
				gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p, val);
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			break;
		default:
			pr_debug("unknown attr type 0x%x\n", attrs[i].type);
			return -EINVAL;
		}

		if (gpuidx < 0) {
			pr_debug("no GPU 0x%x found\n", val);
			return -EINVAL;
		} else if (gpuidx < MAX_GPU_INSTANCE &&
			   !test_bit(gpuidx, p->svms.bitmap_supported)) {
			pr_debug("GPU 0x%x not supported\n", val);
			return -EINVAL;
		}
	}

	return 0;
}

static void
svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange,
		      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
		      bool *update_mapping)
{
	uint32_t i;
	int gpuidx;

	for (i = 0; i < nattr; i++) {
		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			prange->preferred_loc = attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			prange->prefetch_loc = attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			if (!p->xnack_enabled)
				*update_mapping = true;

			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
				bitmap_clear(prange->bitmap_access, gpuidx, 1);
				bitmap_clear(prange->bitmap_aip, gpuidx, 1);
			} else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
				bitmap_set(prange->bitmap_access, gpuidx, 1);
				bitmap_clear(prange->bitmap_aip, gpuidx, 1);
			} else {
				bitmap_clear(prange->bitmap_access, gpuidx, 1);
				bitmap_set(prange->bitmap_aip, gpuidx, 1);
			}
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			*update_mapping = true;
			prange->flags |= attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			*update_mapping = true;
			prange->flags &= ~attrs[i].value;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			prange->granularity = min_t(uint32_t, attrs[i].value, 0x3F);
			break;
		default:
			WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
		}
	}
}

static bool
svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
			uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs)
{
	uint32_t i;
	int gpuidx;

	for (i = 0; i < nattr; i++) {
		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			if (prange->preferred_loc != attrs[i].value)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			/* Prefetch should always trigger a migration even
			 * if the value of the attribute didn't change.
			 */
			return false;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) {
				if (test_bit(gpuidx, prange->bitmap_access) ||
				    test_bit(gpuidx, prange->bitmap_aip))
					return false;
			} else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) {
				if (!test_bit(gpuidx, prange->bitmap_access))
					return false;
			} else {
				if (!test_bit(gpuidx, prange->bitmap_aip))
					return false;
			}
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			if ((prange->flags & attrs[i].value) != attrs[i].value)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			if ((prange->flags & attrs[i].value) != 0)
				return false;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			if (prange->granularity != attrs[i].value)
				return false;
			break;
		default:
			WARN_ONCE(1, "svm_range_check_attrs wasn't called?");
		}
	}

	return true;
}

/**
 * svm_range_debug_dump - print all range information from svms
 * @svms: svm range list header
 *
 * debug output svm range start, end and actual location from the svms
 * interval tree and linked list
 *
 * Context: The caller must hold svms->lock
 */
static void svm_range_debug_dump(struct svm_range_list *svms)
{
	struct interval_tree_node *node;
	struct svm_range *prange;

	pr_debug("dump svms 0x%p list\n", svms);
	pr_debug("range\tstart\tpage\tend\t\tlocation\n");

	list_for_each_entry(prange, &svms->list, list) {
		pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1,
			 prange->actual_loc);
	}

	pr_debug("dump svms 0x%p interval tree\n", svms);
	pr_debug("range\tstart\tpage\tend\t\tlocation\n");
	node = interval_tree_iter_first(&svms->objects, 0, ~0ULL);
	while (node) {
		prange = container_of(node, struct svm_range, it_node);
		pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1,
			 prange->actual_loc);
		node = interval_tree_iter_next(node, 0, ~0ULL);
	}
}

static void *
svm_range_copy_array(void *psrc, size_t size, uint64_t num_elements,
		     uint64_t offset, uint64_t *vram_pages)
{
	unsigned char *src = (unsigned char *)psrc + offset;
	unsigned char *dst;
	uint64_t i;

	dst = kvmalloc_array(num_elements, size, GFP_KERNEL);
	if (!dst)
		return NULL;

	if (!vram_pages) {
		memcpy(dst, src, num_elements * size);
		return (void *)dst;
	}

	*vram_pages = 0;
	for (i = 0; i < num_elements; i++) {
		dma_addr_t *temp;
		temp = (dma_addr_t *)dst + i;
		*temp = *((dma_addr_t *)src + i);
		if (*temp & SVM_RANGE_VRAM_DOMAIN)
			(*vram_pages)++;
	}

	return (void *)dst;
}

static int
svm_range_copy_dma_addrs(struct svm_range *dst, struct svm_range *src)
{
	int i;

	for (i = 0; i < MAX_GPU_INSTANCE; i++) {
		if (!src->dma_addr[i])
			continue;
		dst->dma_addr[i] = svm_range_copy_array(src->dma_addr[i],
					sizeof(*src->dma_addr[i]), src->npages, 0, NULL);
		if (!dst->dma_addr[i])
			return -ENOMEM;
	}

	return 0;
}

static int
svm_range_split_array(void *ppnew, void *ppold, size_t size,
		      uint64_t old_start, uint64_t old_n,
		      uint64_t new_start, uint64_t new_n, uint64_t *new_vram_pages)
{
	unsigned char *new, *old, *pold;
	uint64_t d;

	if (!ppold)
		return 0;
	pold = *(unsigned char **)ppold;
	if (!pold)
		return 0;

	d = (new_start - old_start) * size;
	/* get dma addr array for new range and calculate its vram page number */
	new = svm_range_copy_array(pold, size, new_n, d, new_vram_pages);
	if (!new)
		return -ENOMEM;
	d = (new_start == old_start) ? new_n * size : 0;
	old = svm_range_copy_array(pold, size, old_n, d, NULL);
	if (!old) {
		kvfree(new);
		return -ENOMEM;
	}
	kvfree(pold);
	*(void **)ppold = old;
	*(void **)ppnew = new;

	return 0;
}

static int
svm_range_split_pages(struct svm_range *new, struct svm_range *old,
		      uint64_t start, uint64_t last)
{
	uint64_t npages = last - start + 1;
	int i, r;

	for (i = 0; i < MAX_GPU_INSTANCE; i++) {
		r = svm_range_split_array(&new->dma_addr[i], &old->dma_addr[i],
					  sizeof(*old->dma_addr[i]), old->start,
					  npages, new->start, new->npages,
					  old->actual_loc ? &new->vram_pages : NULL);
		if (r)
			return r;
	}
	if (old->actual_loc)
		old->vram_pages -= new->vram_pages;

	return 0;
}

static int
svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
		      uint64_t start, uint64_t last)
{
	uint64_t npages = last - start + 1;

	pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
		 new->svms, new, new->start, start, last);

	if (new->start == old->start) {
		new->offset = old->offset;
		old->offset += new->npages;
	} else {
		new->offset = old->offset + npages;
	}

	new->svm_bo = svm_range_bo_ref(old->svm_bo);
	new->ttm_res = old->ttm_res;

	spin_lock(&new->svm_bo->list_lock);
	list_add(&new->svm_bo_list, &new->svm_bo->range_list);
	spin_unlock(&new->svm_bo->list_lock);

	return 0;
}

/**
 * svm_range_split_adjust - split range and adjust
 *
 * @new: new range
 * @old: the old range
 * @start: the old range adjust to start address in pages
 * @last: the old range adjust to last address in pages
 *
 * Copy the system memory dma_addr or vram ttm_res of the old range to the new
 * range, covering new->start up to new->npages pages; the remaining old range
 * runs from start to last
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory
 */
static int
svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
		      uint64_t start, uint64_t last)
{
	int r;

	pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n",
		 new->svms, new->start, old->start, old->last, start, last);

	if (new->start < old->start ||
	    new->last > old->last) {
		WARN_ONCE(1, "invalid new range start or last\n");
		return -EINVAL;
	}

	r = svm_range_split_pages(new, old, start, last);
	if (r)
		return r;

	if (old->actual_loc && old->ttm_res) {
		r = svm_range_split_nodes(new, old, start, last);
		if (r)
			return r;
	}

	old->npages = last - start + 1;
	old->start = start;
	old->last = last;
	new->flags = old->flags;
	new->preferred_loc = old->preferred_loc;
	new->prefetch_loc = old->prefetch_loc;
	new->actual_loc = old->actual_loc;
	new->granularity = old->granularity;
	new->mapped_to_gpu = old->mapped_to_gpu;
	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);

	return 0;
}

/**
 * svm_range_split - split a range in 2 ranges
 *
 * @prange: the svm range to split
 * @start: the remaining range start address in pages
 * @last: the remaining range last address in pages
 * @new: the result new range generated
 *
 * Two cases only:
 * case 1: if start == prange->start
 *         prange ==> prange[start, last]
 *         new range [last + 1, prange->last]
 *
 * case 2: if last == prange->last
 *         prange ==> prange[start, last]
 *         new range [prange->start, start - 1]
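 *
 * Example: splitting prange [0x1000 0x1fff] with start 0x1000, last 0x17ff
 * (case 1) leaves prange [0x1000 0x17ff] and creates new [0x1800 0x1fff].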
 *
 * Return:
 * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last
 */
static int
svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last,
		struct svm_range **new)
{
	uint64_t old_start = prange->start;
	uint64_t old_last = prange->last;
	struct svm_range_list *svms;
	int r = 0;

	pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n", prange->svms,
		 old_start, old_last, start, last);

	if (old_start != start && old_last != last)
		return -EINVAL;
	if (start < old_start || last > old_last)
		return -EINVAL;

	svms = prange->svms;
	if (old_start == start)
		*new = svm_range_new(svms, last + 1, old_last, false);
	else
		*new = svm_range_new(svms, old_start, start - 1, false);
	if (!*new)
		return -ENOMEM;

	r = svm_range_split_adjust(*new, prange, start, last);
	if (r) {
		pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n",
			 r, old_start, old_last, start, last);
		svm_range_free(*new, false);
		*new = NULL;
	}

	return r;
}

static int
svm_range_split_tail(struct svm_range *prange, uint64_t new_last,
		     struct list_head *insert_list, struct list_head *remap_list)
{
	struct svm_range *tail = NULL;
	int r = svm_range_split(prange, prange->start, new_last, &tail);

	if (!r) {
		list_add(&tail->list, insert_list);
		if (!IS_ALIGNED(new_last + 1, 1UL << prange->granularity))
			list_add(&tail->update_list, remap_list);
	}
	return r;
}

static int
svm_range_split_head(struct svm_range *prange, uint64_t new_start,
		     struct list_head *insert_list, struct list_head *remap_list)
{
	struct svm_range *head = NULL;
	int r = svm_range_split(prange, new_start, prange->last, &head);

	if (!r) {
		list_add(&head->list, insert_list);
		if (!IS_ALIGNED(new_start, 1UL << prange->granularity))
			list_add(&head->update_list, remap_list);
	}
	return r;
}

static void
svm_range_add_child(struct svm_range *prange, struct mm_struct *mm,
		    struct svm_range *pchild, enum svm_work_list_ops op)
{
	pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n",
		 pchild, pchild->start, pchild->last, prange, op);

	pchild->work_item.mm = mm;
	pchild->work_item.op = op;
	list_add_tail(&pchild->child_list, &prange->child_list);
}

static bool
svm_nodes_in_same_hive(struct kfd_node *node_a, struct kfd_node *node_b)
{
	return (node_a->adev == node_b->adev ||
		amdgpu_xgmi_same_hive(node_a->adev, node_b->adev));
}

static uint64_t
svm_range_get_pte_flags(struct kfd_node *node,
			struct svm_range *prange, int domain)
{
	struct kfd_node *bo_node;
	uint32_t flags = prange->flags;
	uint32_t mapping_flags = 0;
	uint64_t pte_flags;
	bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
	bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT);
	bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT;
	bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/
	unsigned int mtype_local;

	if (domain == SVM_RANGE_VRAM_DOMAIN)
		bo_node = prange->svm_bo->node;

	switch (amdgpu_ip_version(node->adev, GC_HWIP, 0)) {
	case IP_VERSION(9, 4, 1):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_node == node) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (svm_nodes_in_same_hive(node, bo_node))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	case IP_VERSION(9, 4, 2):
		if (domain == SVM_RANGE_VRAM_DOMAIN) {
			if (bo_node == node) {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
				if (node->adev->gmc.xgmi.connected_to_cpu)
					snoop = true;
			} else {
				mapping_flags |= coherent ?
					AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
				if (svm_nodes_in_same_hive(node, bo_node))
					snoop = true;
			}
		} else {
			mapping_flags |= coherent ?
				AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		}
		break;
	case IP_VERSION(9, 4, 3):
		if (ext_coherent)
			mtype_local = node->adev->rev_id ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_UC;
		else
			mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC :
				amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW;
		snoop = true;
		if (uncached) {
			mapping_flags |= AMDGPU_VM_MTYPE_UC;
		} else if (domain == SVM_RANGE_VRAM_DOMAIN) {
			/* local HBM region close to partition */
			if (bo_node->adev == node->adev &&
			    (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id))
				mapping_flags |= mtype_local;
			/* local HBM region far from partition or remote XGMI GPU
			 * with regular system scope coherence
			 */
			else if (svm_nodes_in_same_hive(bo_node, node) && !ext_coherent)
				mapping_flags |= AMDGPU_VM_MTYPE_NC;
			/* PCIe P2P or extended system scope coherence */
			else
				mapping_flags |= AMDGPU_VM_MTYPE_UC;
		/* system memory accessed by the APU */
		} else if (node->adev->flags & AMD_IS_APU) {
			/* On NUMA systems, locality is determined per-page
			 * in amdgpu_gmc_override_vm_pte_flags
			 */
			if (num_possible_nodes() <= 1)
				mapping_flags |= mtype_local;
			else
				mapping_flags |= ext_coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
		/* system memory accessed by the dGPU */
		} else {
			mapping_flags |= AMDGPU_VM_MTYPE_UC;
		}
		break;
	default:
		mapping_flags |= coherent ?
			AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
	}

	mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;

	if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO)
		mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE;
	if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC)
		mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE;

	pte_flags = AMDGPU_PTE_VALID;
	pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM;
	pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0;

	pte_flags |= amdgpu_gem_va_map_flags(node->adev, mapping_flags);
	return pte_flags;
}

static int
svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
			 uint64_t start, uint64_t last,
			 struct dma_fence **fence)
{
	uint64_t init_pte_value = 0;

	pr_debug("[0x%llx 0x%llx]\n", start, last);

	return amdgpu_vm_update_range(adev, vm, false, true, true, false, NULL, start,
				      last, init_pte_value, 0, 0, NULL, NULL,
				      fence);
}

static int
svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start,
			  unsigned long last, uint32_t trigger)
{
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	struct kfd_process_device *pdd;
	struct dma_fence *fence = NULL;
	struct kfd_process *p;
	uint32_t gpuidx;
	int r = 0;

	if (!prange->mapped_to_gpu) {
		pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n",
			 prange, prange->start, prange->last);
		return 0;
	}

	if (prange->start == start && prange->last == last) {
		pr_debug("unmap svms 0x%p prange 0x%p\n", prange->svms, prange);
		prange->mapped_to_gpu = false;
	}

	bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
		  MAX_GPU_INSTANCE);
	p = container_of(prange->svms, struct kfd_process, svms);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		pr_debug("unmap from gpu idx 0x%x\n", gpuidx);
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}

		kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread->pid,
					     start, last, trigger);

		r = svm_range_unmap_from_gpu(pdd->dev->adev,
					     drm_priv_to_vm(pdd->drm_priv),
					     start, last, &fence);
		if (r)
			break;

		if (fence) {
			r = dma_fence_wait(fence, false);
			dma_fence_put(fence);
			fence = NULL;
			if (r)
				break;
		}
		kfd_flush_tlb(pdd, TLB_FLUSH_HEAVYWEIGHT);
	}

	return r;
}

static int
svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
		     unsigned long offset, unsigned long npages, bool readonly,
		     dma_addr_t *dma_addr, struct amdgpu_device *bo_adev,
		     struct dma_fence **fence, bool flush_tlb)
{
	struct amdgpu_device *adev = pdd->dev->adev;
	struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv);
	uint64_t pte_flags;
	unsigned long last_start;
	int last_domain;
	int r = 0;
	int64_t i, j;

	last_start = prange->start + offset;

	pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n", prange->svms,
		 last_start, last_start + npages - 1, readonly);

	for (i = offset; i < offset + npages; i++) {
		last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN;
		dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN;

		/* Collect all pages in the same address range and memory domain
		 * that can be mapped with a single call to update mapping.
		 */
		if (i < offset + npages - 1 &&
		    last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN))
			continue;

		pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n",
			 last_start, prange->start + i, last_domain ? "GPU" : "CPU");

		pte_flags = svm_range_get_pte_flags(pdd->dev, prange, last_domain);
		if (readonly)
			pte_flags &= ~AMDGPU_PTE_WRITEABLE;

		pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n",
			 prange->svms, last_start, prange->start + i,
			 (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0,
			 pte_flags);

		/* For dGPU mode, the same vm_manager is used to allocate VRAM for
		 * different memory partitions based on fpfn/lpfn, so use the same
		 * vm_manager.vram_base_offset regardless of the memory partition.
		 */
		r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, true,
					   NULL, last_start, prange->start + i,
					   pte_flags,
					   (last_start - prange->start) << PAGE_SHIFT,
					   bo_adev ? bo_adev->vm_manager.vram_base_offset : 0,
					   NULL, dma_addr, &vm->last_update);

		for (j = last_start - prange->start; j <= i; j++)
			dma_addr[j] |= last_domain;

		if (r) {
			pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
			goto out;
		}
		last_start = prange->start + i + 1;
	}

	r = amdgpu_vm_update_pdes(adev, vm, false);
	if (r) {
		pr_debug("failed %d to update directories 0x%lx\n", r,
			 prange->start);
		goto out;
	}

	if (fence)
		*fence = dma_fence_get(vm->last_update);

out:
	return r;
}

static int
svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
		      unsigned long npages, bool readonly,
		      unsigned long *bitmap, bool wait, bool flush_tlb)
{
	struct kfd_process_device *pdd;
	struct amdgpu_device *bo_adev = NULL;
	struct kfd_process *p;
	struct dma_fence *fence = NULL;
	uint32_t gpuidx;
	int r = 0;

	if (prange->svm_bo && prange->ttm_res)
		bo_adev = prange->svm_bo->node->adev;

	p = container_of(prange->svms, struct kfd_process, svms);
	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		pr_debug("mapping to gpu idx 0x%x\n", gpuidx);
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			return -EINVAL;
		}

		pdd = kfd_bind_process_to_device(pdd->dev, p);
		if (IS_ERR(pdd))
			return -EINVAL;

		if (bo_adev && pdd->dev->adev != bo_adev &&
		    !amdgpu_xgmi_same_hive(pdd->dev->adev, bo_adev)) {
			pr_debug("cannot map to device idx %d\n", gpuidx);
			continue;
		}

		r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly,
					 prange->dma_addr[gpuidx],
					 bo_adev, wait ? &fence : NULL,
					 flush_tlb);
		if (r)
			break;

		if (fence) {
			r = dma_fence_wait(fence, false);
			dma_fence_put(fence);
			fence = NULL;
			if (r) {
				pr_debug("failed %d to dma fence wait\n", r);
				break;
			}
		}

		kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
	}

	return r;
}

struct svm_validate_context {
	struct kfd_process *process;
	struct svm_range *prange;
	bool intr;
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	struct drm_exec exec;
};

static int svm_range_reserve_bos(struct svm_validate_context *ctx, bool intr)
{
	struct kfd_process_device *pdd;
	struct amdgpu_vm *vm;
	uint32_t gpuidx;
	int r;

	drm_exec_init(&ctx->exec, intr ? DRM_EXEC_INTERRUPTIBLE_WAIT : 0, 0);
	drm_exec_until_all_locked(&ctx->exec) {
		for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
			pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
			if (!pdd) {
				pr_debug("failed to find device idx %d\n", gpuidx);
				r = -EINVAL;
				goto unreserve_out;
			}
			vm = drm_priv_to_vm(pdd->drm_priv);

			r = amdgpu_vm_lock_pd(vm, &ctx->exec, 2);
			drm_exec_retry_on_contention(&ctx->exec);
			if (unlikely(r)) {
				pr_debug("failed %d to reserve bo\n", r);
				goto unreserve_out;
			}
		}
	}

	for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(ctx->process, gpuidx);
		if (!pdd) {
			pr_debug("failed to find device idx %d\n", gpuidx);
			r = -EINVAL;
			goto unreserve_out;
		}

		r = amdgpu_vm_validate(pdd->dev->adev,
				       drm_priv_to_vm(pdd->drm_priv), NULL,
				       svm_range_bo_validate, NULL);
		if (r) {
			pr_debug("failed %d validate pt bos\n", r);
			goto unreserve_out;
		}
	}

	return 0;

unreserve_out:
	drm_exec_fini(&ctx->exec);
	return r;
}

static void svm_range_unreserve_bos(struct svm_validate_context *ctx)
{
	drm_exec_fini(&ctx->exec);
}

static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx)
{
	struct kfd_process_device *pdd;

	pdd = kfd_process_device_from_gpuidx(p, gpuidx);
	if (!pdd)
		return NULL;

	return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev);
}

/*
 * Validation+GPU mapping with concurrent invalidation (MMU notifiers)
 *
 * To prevent concurrent destruction or change of range attributes, the
 * svm_read_lock must be held. The caller must not hold the svm_write_lock
 * because that would block concurrent evictions and lead to deadlocks. To
 * serialize concurrent migrations or validations of the same range, the
 * prange->migrate_mutex must be held.
 *
 * For VRAM ranges, the SVM BO must be allocated and valid (protected by its
 * eviction fence).
 *
 * The following sequence ensures race-free validation and GPU mapping:
 *
 * 1. Reserve page table (and SVM BO if range is in VRAM)
 * 2. hmm_range_fault to get page addresses (if system memory)
 * 3. DMA-map pages (if system memory)
 * 4-a. Take notifier lock
 * 4-b. Check that the pages are still valid (mmu_interval_read_retry)
 * 4-c. Check that the range was not split or otherwise invalidated
 * 4-d. Update GPU page table
 * 4-e. Release notifier lock
 * 5. Release page table (and SVM BO) reservation
 */
static int svm_range_validate_and_map(struct mm_struct *mm,
				      unsigned long map_start, unsigned long map_last,
				      struct svm_range *prange, int32_t gpuidx,
				      bool intr, bool wait, bool flush_tlb)
{
	struct svm_validate_context *ctx;
	unsigned long start, end, addr;
	struct kfd_process *p;
	void *owner;
	int32_t idx;
	int r = 0;

	ctx = kzalloc(sizeof(struct svm_validate_context), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	ctx->process = container_of(prange->svms, struct kfd_process, svms);
	ctx->prange = prange;
	ctx->intr = intr;

	if (gpuidx < MAX_GPU_INSTANCE) {
		bitmap_zero(ctx->bitmap, MAX_GPU_INSTANCE);
		bitmap_set(ctx->bitmap, gpuidx, 1);
	} else if (ctx->process->xnack_enabled) {
		bitmap_copy(ctx->bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);

		/* If the range was prefetched to a GPU, or a GPU retry fault
		 * migrated the range to a GPU that has the ACCESS attribute for
		 * the range, create the mapping on that GPU.
		 */
		if (prange->actual_loc) {
			gpuidx = kfd_process_gpuidx_from_gpuid(ctx->process,
							prange->actual_loc);
			if (gpuidx < 0) {
				WARN_ONCE(1, "failed get device by id 0x%x\n",
					 prange->actual_loc);
				r = -EINVAL;
				goto free_ctx;
			}
			if (test_bit(gpuidx, prange->bitmap_access))
				bitmap_set(ctx->bitmap, gpuidx, 1);
		}

		/*
		 * If prange is already mapped or has the always-mapped flag set,
		 * update the mapping on GPUs with the ACCESS attribute.
		 */
		if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
			if (prange->mapped_to_gpu ||
			    prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)
				bitmap_copy(ctx->bitmap, prange->bitmap_access, MAX_GPU_INSTANCE);
		}
	} else {
		bitmap_or(ctx->bitmap, prange->bitmap_access,
			  prange->bitmap_aip, MAX_GPU_INSTANCE);
	}

	if (bitmap_empty(ctx->bitmap, MAX_GPU_INSTANCE)) {
		r = 0;
		goto free_ctx;
	}

	if (prange->actual_loc && !prange->ttm_res) {
		/* This should never happen. actual_loc gets set by
		 * svm_migrate_ram_to_vram after allocating a BO.
		 */
		WARN_ONCE(1, "VRAM BO missing during validation\n");
		r = -EINVAL;
		goto free_ctx;
	}

	r = svm_range_reserve_bos(ctx, intr);
	if (r)
		goto free_ctx;

	p = container_of(prange->svms, struct kfd_process, svms);
	owner = kfd_svm_page_owner(p, find_first_bit(ctx->bitmap,
						MAX_GPU_INSTANCE));
	for_each_set_bit(idx, ctx->bitmap, MAX_GPU_INSTANCE) {
		if (kfd_svm_page_owner(p, idx) != owner) {
			owner = NULL;
			break;
		}
	}

	start = map_start << PAGE_SHIFT;
	end = (map_last + 1) << PAGE_SHIFT;
	for (addr = start; !r && addr < end; ) {
		struct hmm_range *hmm_range;
		unsigned long map_start_vma;
		unsigned long map_last_vma;
		struct vm_area_struct *vma;
		unsigned long next = 0;
		unsigned long offset;
		unsigned long npages;
		bool readonly;

		vma = vma_lookup(mm, addr);
		if (vma) {
			readonly = !(vma->vm_flags & VM_WRITE);

			next = min(vma->vm_end, end);
			npages = (next - addr) >> PAGE_SHIFT;
			WRITE_ONCE(p->svms.faulting_task, current);
			r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
						       readonly, owner, NULL,
						       &hmm_range);
			WRITE_ONCE(p->svms.faulting_task, NULL);
			if (r) {
				pr_debug("failed %d to get svm range pages\n", r);
				if (r == -EBUSY)
					r = -EAGAIN;
			}
		} else {
			r = -EFAULT;
		}

		if (!r) {
			offset = (addr >> PAGE_SHIFT) - prange->start;
			r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
					      hmm_range->hmm_pfns);
			if (r)
				pr_debug("failed %d to dma map range\n", r);
		}

		svm_range_lock(prange);
		if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
			pr_debug("hmm update the range, need validate again\n");
			r = -EAGAIN;
		}

		if (!r && !list_empty(&prange->child_list)) {
			pr_debug("range split by unmap in parallel, validate again\n");
			r = -EAGAIN;
		}

		if (!r) {
			map_start_vma = max(map_start, prange->start + offset);
			map_last_vma = min(map_last, prange->start + offset + npages - 1);
			if (map_start_vma <= map_last_vma) {
				offset = map_start_vma - prange->start;
				npages = map_last_vma - map_start_vma + 1;
				r = svm_range_map_to_gpus(prange, offset, npages, readonly,
							  ctx->bitmap, wait, flush_tlb);
			}
		}

		if (!r && next == end)
			prange->mapped_to_gpu = true;

		svm_range_unlock(prange);

		addr = next;
	}

	svm_range_unreserve_bos(ctx);
	if (!r)
		prange->validate_timestamp = ktime_get_boottime();

free_ctx:
	kfree(ctx);

	return r;
}

/**
 * svm_range_list_lock_and_flush_work - flush pending deferred work
 *
 * @svms: the svm range list
 * @mm: the mm structure
 *
 * Context: Returns with mmap write lock held, pending deferred work flushed
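 *
 * Typical usage (see svm_range_restore_work()): call this, take svms->lock,
 * do the work, then release svms->lock and the mmap write lock in that order.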
 *
 */
void
svm_range_list_lock_and_flush_work(struct svm_range_list *svms,
				   struct mm_struct *mm)
{
retry_flush_work:
	flush_work(&svms->deferred_list_work);
	mmap_write_lock(mm);

	if (list_empty(&svms->deferred_range_list))
		return;
	mmap_write_unlock(mm);
	pr_debug("retry flush\n");
	goto retry_flush_work;
}

static void svm_range_restore_work(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct amdkfd_process_info *process_info;
	struct svm_range_list *svms;
	struct svm_range *prange;
	struct kfd_process *p;
	struct mm_struct *mm;
	int evicted_ranges;
	int invalid;
	int r;

	svms = container_of(dwork, struct svm_range_list, restore_work);
	evicted_ranges = atomic_read(&svms->evicted_ranges);
	if (!evicted_ranges)
		return;

	pr_debug("restore svm ranges\n");

	p = container_of(svms, struct kfd_process, svms);
	process_info = p->kgd_process_info;

	/* Keep mm reference when svm_range_validate_and_map ranges */
	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_debug("svms 0x%p process mm gone\n", svms);
		return;
	}

	mutex_lock(&process_info->lock);
	svm_range_list_lock_and_flush_work(svms, mm);
	mutex_lock(&svms->lock);

	evicted_ranges = atomic_read(&svms->evicted_ranges);

	list_for_each_entry(prange, &svms->list, list) {
		invalid = atomic_read(&prange->invalid);
		if (!invalid)
			continue;

		pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n",
			 prange->svms, prange, prange->start, prange->last,
			 invalid);

		/*
		 * If the range is migrating, wait for the migration to finish.
		 */
		mutex_lock(&prange->migrate_mutex);

		r = svm_range_validate_and_map(mm, prange->start, prange->last, prange,
					       MAX_GPU_INSTANCE, false, true, false);
		if (r)
			pr_debug("failed %d to map 0x%lx to gpus\n", r,
				 prange->start);

		mutex_unlock(&prange->migrate_mutex);
		if (r)
			goto out_reschedule;

		if (atomic_cmpxchg(&prange->invalid, invalid, 0) != invalid)
			goto out_reschedule;
	}

	if (atomic_cmpxchg(&svms->evicted_ranges, evicted_ranges, 0) !=
	    evicted_ranges)
		goto out_reschedule;

	evicted_ranges = 0;

	r = kgd2kfd_resume_mm(mm);
	if (r) {
		/* No recovery from this failure. Probably the CP is
		 * hanging. No point trying again.
		 */
		pr_debug("failed %d to resume KFD\n", r);
	}

	pr_debug("restore svm ranges successfully\n");

out_reschedule:
	mutex_unlock(&svms->lock);
	mmap_write_unlock(mm);
	mutex_unlock(&process_info->lock);

	/* If validation failed, reschedule another attempt */
	if (evicted_ranges) {
		pr_debug("reschedule to restore svm range\n");
		queue_delayed_work(system_freezable_wq, &svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));

		kfd_smi_event_queue_restore_rescheduled(mm);
	}
	mmput(mm);
}

/**
 * svm_range_evict - evict svm range
 * @prange: svm range structure
 * @mm: current process mm_struct
 * @start: start address of the range being evicted, in pages
 * @last: last address of the range being evicted, in pages
 * @event: mmu notifier event when range is evicted or migrated
 *
 * Stop all queues of the process to ensure the GPU doesn't access the memory,
 * then return to let the CPU evict the buffer and proceed with the CPU page
 * table update.
 *
 * No lock is needed to sync CPU page table invalidation with GPU execution.
 * If an invalidation happens while the restore work is running, the restore
 * work restarts to pick up the latest CPU page mapping for the GPU before
 * starting the queues.
 */
static int
svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
		unsigned long start, unsigned long last,
		enum mmu_notifier_event event)
{
	struct svm_range_list *svms = prange->svms;
	struct svm_range *pchild;
	struct kfd_process *p;
	int r = 0;

	p = container_of(svms, struct kfd_process, svms);

	pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
		 svms, prange->start, prange->last, start, last);

	if (!p->xnack_enabled ||
	    (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) {
		int evicted_ranges;
		bool mapped = prange->mapped_to_gpu;

		list_for_each_entry(pchild, &prange->child_list, child_list) {
			if (!pchild->mapped_to_gpu)
				continue;
			mapped = true;
			mutex_lock_nested(&pchild->lock, 1);
			if (pchild->start <= last && pchild->last >= start) {
				pr_debug("increment pchild invalid [0x%lx 0x%lx]\n",
					 pchild->start, pchild->last);
				atomic_inc(&pchild->invalid);
			}
			mutex_unlock(&pchild->lock);
		}

		if (!mapped)
			return r;

		if (prange->start <= last && prange->last >= start)
			atomic_inc(&prange->invalid);

		evicted_ranges = atomic_inc_return(&svms->evicted_ranges);
		if (evicted_ranges != 1)
			return r;

		pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n",
			 prange->svms, prange->start, prange->last);

		/* First eviction, stop the queues */
		r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
		if (r)
			pr_debug("failed to quiesce KFD\n");

		pr_debug("schedule to restore svm %p ranges\n", svms);
		queue_delayed_work(system_freezable_wq, &svms->restore_work,
			msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
	} else {
		unsigned long s, l;
		uint32_t trigger;

		if (event == MMU_NOTIFY_MIGRATE)
			trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE;
		else
			trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY;

		pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
			 prange->svms, start, last);
		list_for_each_entry(pchild, &prange->child_list, child_list) {
			mutex_lock_nested(&pchild->lock, 1);
			s = max(start, pchild->start);
			l = min(last, pchild->last);
			if (l >= s)
				svm_range_unmap_from_gpus(pchild, s, l, trigger);
			mutex_unlock(&pchild->lock);
		}
		s = max(start, prange->start);
		l = min(last, prange->last);
		if (l >= s)
			svm_range_unmap_from_gpus(prange, s, l, trigger);
	}

	return r;
}

static struct svm_range *svm_range_clone(struct svm_range *old)
{
	struct svm_range *new;

	new = svm_range_new(old->svms, old->start, old->last, false);
	if (!new)
		return NULL;
	if (svm_range_copy_dma_addrs(new, old)) {
		svm_range_free(new, false);
		return NULL;
	}
	if (old->svm_bo) {
		new->ttm_res = old->ttm_res;
		new->offset = old->offset;
		new->svm_bo = svm_range_bo_ref(old->svm_bo);
		spin_lock(&new->svm_bo->list_lock);
		list_add(&new->svm_bo_list, &new->svm_bo->range_list);
		spin_unlock(&new->svm_bo->list_lock);
	}
	new->flags = old->flags;
	new->preferred_loc = old->preferred_loc;
	new->prefetch_loc = old->prefetch_loc;
	new->actual_loc = old->actual_loc;
	new->granularity = old->granularity;
	new->mapped_to_gpu = old->mapped_to_gpu;
	new->vram_pages = old->vram_pages;
	bitmap_copy(new->bitmap_access, old->bitmap_access, MAX_GPU_INSTANCE);
	bitmap_copy(new->bitmap_aip, old->bitmap_aip, MAX_GPU_INSTANCE);

	return new;
}

void svm_range_set_max_pages(struct amdgpu_device *adev)
{
	uint64_t max_pages;
	uint64_t pages, _pages;
	uint64_t min_pages = 0;
	int i, id;

	for (i = 0; i < adev->kfd.dev->num_nodes; i++) {
		if (adev->kfd.dev->nodes[i]->xcp)
			id = adev->kfd.dev->nodes[i]->xcp->id;
		else
			id = -1;
		pages = KFD_XCP_MEMORY_SIZE(adev, id) >> 17;
		pages = clamp(pages, 1ULL << 9, 1ULL << 18);
		pages = rounddown_pow_of_two(pages);
		min_pages = min_not_zero(min_pages, pages);
	}

	do {
		max_pages = READ_ONCE(max_svm_range_pages);
		_pages = min_not_zero(max_pages, min_pages);
	} while (cmpxchg(&max_svm_range_pages, max_pages, _pages) != max_pages);
}
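
/*
 * Worked example for svm_range_set_max_pages (illustrative numbers only):
 * KFD_XCP_MEMORY_SIZE() >> 17 is 1/32 of the partition's VRAM expressed in
 * 4 KiB pages. 64 GiB of VRAM gives 0x80000 pages (2 GiB), which the clamp
 * to [1 << 9, 1 << 18] caps at 0x40000 pages (1 GiB); 8 GiB gives 0x10000
 * pages (256 MiB), already a power of two, so it is kept as-is. The cmpxchg
 * loop then keeps max_svm_range_pages at the minimum across all GPUs.
 */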

static int
svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last,
		    uint64_t max_pages, struct list_head *insert_list,
		    struct list_head *update_list)
{
	struct svm_range *prange;
	uint64_t l;

	pr_debug("max_svm_range_pages 0x%llx adding [0x%llx 0x%llx]\n",
		 max_pages, start, last);

	while (last >= start) {
		l = min(last, ALIGN_DOWN(start + max_pages, max_pages) - 1);

		prange = svm_range_new(svms, start, l, true);
		if (!prange)
			return -ENOMEM;
		list_add(&prange->list, insert_list);
		list_add(&prange->update_list, update_list);

		start = l + 1;
	}
	return 0;
}
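
/*
 * Example of the carving done by svm_range_split_new (illustrative page
 * numbers): with max_pages = 0x200, a request for [0x300 0x9ff] produces
 * [0x300 0x3ff], [0x400 0x5ff], [0x600 0x7ff] and [0x800 0x9ff], so every
 * new prange ends on a max_pages-aligned boundary and later faults and
 * migrations operate on bounded, aligned chunks.
 */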

/**
 * svm_range_add - add svm range and handle overlap
 * @p: the process whose svms list the range is added to
 * @start: page size aligned
 * @size: page size aligned
 * @nattr: number of attributes
 * @attrs: array of attributes
 * @update_list: output, the ranges need validate and update GPU mapping
 * @insert_list: output, the ranges need insert to svms
 * @remove_list: output, the ranges are replaced and need remove from svms
 * @remap_list: output, remap unaligned svm ranges
 *
 * Check if the virtual address range has overlap with any existing ranges,
 * split partly overlapping ranges and add new ranges in the gaps. All changes
 * should be applied to the range_list and interval tree transactionally. If
 * any range split or allocation fails, the entire update fails. Therefore any
 * existing overlapping svm_ranges are cloned and the original svm_ranges left
 * unchanged.
 *
 * If the transaction succeeds, the caller can update and insert clones and
 * new ranges, then free the originals.
 *
 * Otherwise the caller can free the clones and new ranges, while the old
 * svm_ranges remain unchanged.
 *
 * Context: Process context, caller must hold svms->lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
	      uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs,
	      struct list_head *update_list, struct list_head *insert_list,
	      struct list_head *remove_list, struct list_head *remap_list)
{
	unsigned long last = start + size - 1UL;
	struct svm_range_list *svms = &p->svms;
	struct interval_tree_node *node;
	struct svm_range *prange;
	struct svm_range *tmp;
	struct list_head new_list;
	int r = 0;

	pr_debug("svms 0x%p [0x%llx 0x%lx]\n", &p->svms, start, last);

	INIT_LIST_HEAD(update_list);
	INIT_LIST_HEAD(insert_list);
	INIT_LIST_HEAD(remove_list);
	INIT_LIST_HEAD(&new_list);
	INIT_LIST_HEAD(remap_list);

	node = interval_tree_iter_first(&svms->objects, start, last);
	while (node) {
		struct interval_tree_node *next;
		unsigned long next_start;

		pr_debug("found overlap node [0x%lx 0x%lx]\n", node->start,
			 node->last);

		prange = container_of(node, struct svm_range, it_node);
		next = interval_tree_iter_next(node, start, last);
		next_start = min(node->last, last) + 1;

		if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
		    prange->mapped_to_gpu) {
			/* nothing to do */
		} else if (node->start < start || node->last > last) {
			/* node intersects the update range and its attributes
			 * will change. Clone and split it, apply updates only
			 * to the overlapping part
			 */
			struct svm_range *old = prange;

			prange = svm_range_clone(old);
			if (!prange) {
				r = -ENOMEM;
				goto out;
			}

			list_add(&old->update_list, remove_list);
			list_add(&prange->list, insert_list);
			list_add(&prange->update_list, update_list);

			if (node->start < start) {
				pr_debug("change old range start\n");
				r = svm_range_split_head(prange, start,
							 insert_list, remap_list);
				if (r)
					goto out;
			}
			if (node->last > last) {
				pr_debug("change old range last\n");
				r = svm_range_split_tail(prange, last,
							 insert_list, remap_list);
				if (r)
					goto out;
			}
		} else {
			/* The node is contained within start..last,
			 * just update it
			 */
			list_add(&prange->update_list, update_list);
		}

		/* insert a new node if needed */
		if (node->start > start) {
			r = svm_range_split_new(svms, start, node->start - 1,
						READ_ONCE(max_svm_range_pages),
						&new_list, update_list);
			if (r)
				goto out;
		}

		node = next;
		start = next_start;
	}

	/* add a final range at the end if needed */
	if (start <= last)
		r = svm_range_split_new(svms, start, last,
					READ_ONCE(max_svm_range_pages),
					&new_list, update_list);

out:
	if (r) {
		list_for_each_entry_safe(prange, tmp, insert_list, list)
			svm_range_free(prange, false);
		list_for_each_entry_safe(prange, tmp, &new_list, list)
			svm_range_free(prange, true);
	} else {
		list_splice(&new_list, insert_list);
	}

	return r;
}
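
/*
 * Sketch of how the svm_range_add output lists are consumed on success (this
 * mirrors svm_range_set_attr further down in this file and is shown here only
 * to make the transaction model explicit):
 *
 *	list_for_each_entry_safe(prange, next, &insert_list, list) {
 *		svm_range_add_to_svms(prange);
 *		svm_range_add_notifier_locked(mm, prange);
 *	}
 *	list_for_each_entry(prange, &update_list, update_list)
 *		svm_range_apply_attrs(p, prange, nattr, attrs, ...);
 *	list_for_each_entry_safe(prange, next, &remove_list, update_list) {
 *		svm_range_unlink(prange);
 *		svm_range_remove_notifier(prange);
 *		svm_range_free(prange, false);
 *	}
 *
 * On failure the caller frees the clones and new ranges instead and the
 * original svm_ranges stay untouched.
 */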

static void
svm_range_update_notifier_and_interval_tree(struct mm_struct *mm,
					    struct svm_range *prange)
{
	unsigned long start;
	unsigned long last;

	start = prange->notifier.interval_tree.start >> PAGE_SHIFT;
	last = prange->notifier.interval_tree.last >> PAGE_SHIFT;

	if (prange->start == start && prange->last == last)
		return;

	pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n",
		  prange->svms, prange, start, last, prange->start,
		  prange->last);

	if (start != 0 && last != 0) {
		interval_tree_remove(&prange->it_node, &prange->svms->objects);
		svm_range_remove_notifier(prange);
	}
	prange->it_node.start = prange->start;
	prange->it_node.last = prange->last;

	interval_tree_insert(&prange->it_node, &prange->svms->objects);
	svm_range_add_notifier_locked(mm, prange);
}

static void
svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange,
			 struct mm_struct *mm)
{
	switch (prange->work_item.op) {
	case SVM_OP_NULL:
		pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n",
			 svms, prange, prange->start, prange->last);
		break;
	case SVM_OP_UNMAP_RANGE:
		pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n",
			 svms, prange, prange->start, prange->last);
		svm_range_unlink(prange);
		svm_range_remove_notifier(prange);
		svm_range_free(prange, true);
		break;
	case SVM_OP_UPDATE_RANGE_NOTIFIER:
		pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n",
			 svms, prange, prange->start, prange->last);
		svm_range_update_notifier_and_interval_tree(mm, prange);
		break;
	case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP:
		pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n",
			 svms, prange, prange->start, prange->last);
		svm_range_update_notifier_and_interval_tree(mm, prange);
		/* TODO: implement deferred validation and mapping */
		break;
	case SVM_OP_ADD_RANGE:
		pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms, prange,
			 prange->start, prange->last);
		svm_range_add_to_svms(prange);
		svm_range_add_notifier_locked(mm, prange);
		break;
	case SVM_OP_ADD_RANGE_AND_MAP:
		pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n", svms,
			 prange, prange->start, prange->last);
		svm_range_add_to_svms(prange);
		svm_range_add_notifier_locked(mm, prange);
		/* TODO: implement deferred validation and mapping */
		break;
	default:
		WARN_ONCE(1, "Unknown prange 0x%p work op %d\n", prange,
			 prange->work_item.op);
	}
}

static void svm_range_drain_retry_fault(struct svm_range_list *svms)
{
	struct kfd_process_device *pdd;
	struct kfd_process *p;
	int drain;
	uint32_t i;

	p = container_of(svms, struct kfd_process, svms);

restart:
	drain = atomic_read(&svms->drain_pagefaults);
	if (!drain)
		return;

	for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
		pdd = p->pdds[i];
		if (!pdd)
			continue;

		pr_debug("drain retry fault gpu %d svms %p\n", i, svms);

		amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
				pdd->dev->adev->irq.retry_cam_enabled ?
				&pdd->dev->adev->irq.ih :
				&pdd->dev->adev->irq.ih1);

		if (pdd->dev->adev->irq.retry_cam_enabled)
			amdgpu_ih_wait_on_checkpoint_process_ts(pdd->dev->adev,
				&pdd->dev->adev->irq.ih_soft);


		pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
	}
	if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain)
		goto restart;
}

static void svm_range_deferred_list_work(struct work_struct *work)
{
	struct svm_range_list *svms;
	struct svm_range *prange;
	struct mm_struct *mm;

	svms = container_of(work, struct svm_range_list, deferred_list_work);
	pr_debug("enter svms 0x%p\n", svms);

	spin_lock(&svms->deferred_list_lock);
	while (!list_empty(&svms->deferred_range_list)) {
		prange = list_first_entry(&svms->deferred_range_list,
					  struct svm_range, deferred_list);
		spin_unlock(&svms->deferred_list_lock);

		pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
			 prange->start, prange->last, prange->work_item.op);

		mm = prange->work_item.mm;
retry:
		mmap_write_lock(mm);

		/* Checking for the need to drain retry faults must be inside
		 * mmap write lock to serialize with munmap notifiers.
		 */
		if (unlikely(atomic_read(&svms->drain_pagefaults))) {
			mmap_write_unlock(mm);
			svm_range_drain_retry_fault(svms);
			goto retry;
		}

		/* Remove from deferred_list must be inside mmap write lock, for
		 * two race cases:
		 * 1. unmap_from_cpu may change work_item.op and add the range
		 *    to deferred_list again, causing a use-after-free bug.
		 * 2. svm_range_list_lock_and_flush_work may hold mmap write
		 *    lock and continue because deferred_list is empty, but
		 *    deferred_list work is actually waiting for mmap lock.
		 */
		spin_lock(&svms->deferred_list_lock);
		list_del_init(&prange->deferred_list);
		spin_unlock(&svms->deferred_list_lock);

		mutex_lock(&svms->lock);
		mutex_lock(&prange->migrate_mutex);
		while (!list_empty(&prange->child_list)) {
			struct svm_range *pchild;

			pchild = list_first_entry(&prange->child_list,
						struct svm_range, child_list);
			pr_debug("child prange 0x%p op %d\n", pchild,
				 pchild->work_item.op);
			list_del_init(&pchild->child_list);
			svm_range_handle_list_op(svms, pchild, mm);
		}
		mutex_unlock(&prange->migrate_mutex);

		svm_range_handle_list_op(svms, prange, mm);
		mutex_unlock(&svms->lock);
		mmap_write_unlock(mm);

		/* Pairs with mmget in svm_range_add_list_work. If dropping the
		 * last mm refcount, schedule release work to avoid circular locking
		 */
		mmput_async(mm);

		spin_lock(&svms->deferred_list_lock);
	}
	spin_unlock(&svms->deferred_list_lock);
	pr_debug("exit svms 0x%p\n", svms);
}

void
svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
			struct mm_struct *mm, enum svm_work_list_ops op)
{
	spin_lock(&svms->deferred_list_lock);
	/* if prange is on the deferred list */
	if (!list_empty(&prange->deferred_list)) {
		pr_debug("update exist prange 0x%p work op %d\n", prange, op);
		WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n");
		if (op != SVM_OP_NULL &&
		    prange->work_item.op != SVM_OP_UNMAP_RANGE)
			prange->work_item.op = op;
	} else {
		prange->work_item.op = op;

		/* Pairs with mmput in deferred_list_work */
		mmget(mm);
		prange->work_item.mm = mm;
		list_add_tail(&prange->deferred_list,
			      &prange->svms->deferred_range_list);
		pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n",
			 prange, prange->start, prange->last, op);
	}
	spin_unlock(&svms->deferred_list_lock);
}
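
/*
 * Illustrative usage (matches svm_range_unmap_from_cpu below): queue the work
 * item first, then kick the worker. Keeping the two calls separate lets a
 * caller queue several ranges before a single schedule_work:
 *
 *	svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
 *	schedule_deferred_list_work(svms);
 */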

void schedule_deferred_list_work(struct svm_range_list *svms)
{
	spin_lock(&svms->deferred_list_lock);
	if (!list_empty(&svms->deferred_range_list))
		schedule_work(&svms->deferred_list_work);
	spin_unlock(&svms->deferred_list_lock);
}

static void
svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent,
		      struct svm_range *prange, unsigned long start,
		      unsigned long last)
{
	struct svm_range *head;
	struct svm_range *tail;

	if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
		pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n", prange,
			 prange->start, prange->last);
		return;
	}
	if (start > prange->last || last < prange->start)
		return;

	head = tail = prange;
	if (start > prange->start)
		svm_range_split(prange, prange->start, start - 1, &tail);
	if (last < tail->last)
		svm_range_split(tail, last + 1, tail->last, &head);

	if (head != prange && tail != prange) {
		svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
		svm_range_add_child(parent, mm, tail, SVM_OP_ADD_RANGE);
	} else if (tail != prange) {
		svm_range_add_child(parent, mm, tail, SVM_OP_UNMAP_RANGE);
	} else if (head != prange) {
		svm_range_add_child(parent, mm, head, SVM_OP_UNMAP_RANGE);
	} else if (parent != prange) {
		prange->work_item.op = SVM_OP_UNMAP_RANGE;
	}
}
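
/*
 * Example of the split handling above (illustrative page numbers): unmapping
 * [0x400 0x4ff] from a prange covering [0x300 0x5ff] shrinks prange to the
 * leading piece [0x300 0x3ff], adds the unmapped middle [0x400 0x4ff] as a
 * child with SVM_OP_UNMAP_RANGE, and adds the trailing piece [0x500 0x5ff]
 * as a child with SVM_OP_ADD_RANGE so the deferred worker re-inserts it.
 */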

static void
svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
			 unsigned long start, unsigned long last)
{
	uint32_t trigger = KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU;
	struct svm_range_list *svms;
	struct svm_range *pchild;
	struct kfd_process *p;
	unsigned long s, l;
	bool unmap_parent;

	p = kfd_lookup_process_by_mm(mm);
	if (!p)
		return;
	svms = &p->svms;

	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
		 prange, prange->start, prange->last, start, last);

	/* Make sure pending page faults are drained in the deferred worker
	 * before the range is freed to avoid straggler interrupts on
	 * unmapped memory causing "phantom faults".
	 */
	atomic_inc(&svms->drain_pagefaults);

	unmap_parent = start <= prange->start && last >= prange->last;

	list_for_each_entry(pchild, &prange->child_list, child_list) {
		mutex_lock_nested(&pchild->lock, 1);
		s = max(start, pchild->start);
		l = min(last, pchild->last);
		if (l >= s)
			svm_range_unmap_from_gpus(pchild, s, l, trigger);
		svm_range_unmap_split(mm, prange, pchild, start, last);
		mutex_unlock(&pchild->lock);
	}
	s = max(start, prange->start);
	l = min(last, prange->last);
	if (l >= s)
		svm_range_unmap_from_gpus(prange, s, l, trigger);
	svm_range_unmap_split(mm, prange, prange, start, last);

	if (unmap_parent)
		svm_range_add_list_work(svms, prange, mm, SVM_OP_UNMAP_RANGE);
	else
		svm_range_add_list_work(svms, prange, mm,
					SVM_OP_UPDATE_RANGE_NOTIFIER);
	schedule_deferred_list_work(svms);

	kfd_unref_process(p);
}

/**
 * svm_range_cpu_invalidate_pagetables - interval notifier callback
 * @mni: mmu_interval_notifier struct
 * @range: mmu_notifier_range struct
 * @cur_seq: value to pass to mmu_interval_set_seq()
 *
 * If event is MMU_NOTIFY_UNMAP, this is from CPU unmap range, otherwise, it
 * is from migration, or CPU page invalidation callback.
 *
 * For unmap event, unmap range from GPUs, remove prange from svms in a delayed
 * work thread, and split prange if only part of prange is unmapped.
 *
 * For invalidation event, if GPU retry fault is not enabled, evict the queues,
 * then schedule svm_range_restore_work to update GPU mapping and resume queues.
 * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will
 * update GPU mapping to recover.
 *
 * Context: mmap lock, notifier_invalidate_start lock are held
 *          for invalidate event, prange lock is held if this is from migration
 */
static bool
svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
				    const struct mmu_notifier_range *range,
				    unsigned long cur_seq)
{
	struct svm_range *prange;
	unsigned long start;
	unsigned long last;

	if (range->event == MMU_NOTIFY_RELEASE)
		return true;
	if (!mmget_not_zero(mni->mm))
		return true;

	start = mni->interval_tree.start;
	last = mni->interval_tree.last;
	start = max(start, range->start) >> PAGE_SHIFT;
	last = min(last, range->end - 1) >> PAGE_SHIFT;
	pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n",
		 start, last, range->start >> PAGE_SHIFT,
		 (range->end - 1) >> PAGE_SHIFT,
		 mni->interval_tree.start >> PAGE_SHIFT,
		 mni->interval_tree.last >> PAGE_SHIFT, range->event);

	prange = container_of(mni, struct svm_range, notifier);

	svm_range_lock(prange);
	mmu_interval_set_seq(mni, cur_seq);

	switch (range->event) {
	case MMU_NOTIFY_UNMAP:
		svm_range_unmap_from_cpu(mni->mm, prange, start, last);
		break;
	default:
		svm_range_evict(prange, mni->mm, start, last, range->event);
		break;
	}

	svm_range_unlock(prange);
	mmput(mni->mm);

	return true;
}

/**
 * svm_range_from_addr - find svm range from fault address
 * @svms: svm range list header
 * @addr: address to search range interval tree, in pages
 * @parent: parent range if range is on child list
 *
 * Context: The caller must hold svms->lock
 *
 * Return: the svm_range found or NULL
 */
struct svm_range *
svm_range_from_addr(struct svm_range_list *svms, unsigned long addr,
		    struct svm_range **parent)
{
	struct interval_tree_node *node;
	struct svm_range *prange;
	struct svm_range *pchild;

	node = interval_tree_iter_first(&svms->objects, addr, addr);
	if (!node)
		return NULL;

	prange = container_of(node, struct svm_range, it_node);
	pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n",
		 addr, prange->start, prange->last, node->start, node->last);

	if (addr >= prange->start && addr <= prange->last) {
		if (parent)
			*parent = prange;
		return prange;
	}
	list_for_each_entry(pchild, &prange->child_list, child_list)
		if (addr >= pchild->start && addr <= pchild->last) {
			pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n",
				 addr, pchild->start, pchild->last);
			if (parent)
				*parent = prange;
			return pchild;
		}

	return NULL;
}

/* svm_range_best_restore_location - decide the best fault restore location
 * @prange: svm range structure
 * @node: the kfd node on which the vm fault happened
 *
 * This is only called when xnack is on, to decide the best location to restore
 * the range mapping after GPU vm fault. Caller uses the best location to do
 * migration if actual loc is not best location, then update GPU page table
 * mapping to the best location.
 *
 * If the preferred loc is accessible by faulting GPU, use preferred loc.
 * If vm fault gpu idx is on range ACCESSIBLE bitmap, best_loc is vm fault gpu
 * If vm fault gpu idx is on range ACCESSIBLE_IN_PLACE bitmap, then
 *    if range actual loc is cpu, best_loc is cpu
 *    if vm fault gpu is on xgmi same hive of range actual loc gpu, best_loc is
 *    range actual loc.
 * Otherwise, GPU no access, best_loc is -1.
 *
 * Return:
 * -1 means vm fault GPU no access
 * 0 for CPU or GPU id
 */
static int32_t
svm_range_best_restore_location(struct svm_range *prange,
				struct kfd_node *node,
				int32_t *gpuidx)
{
	struct kfd_node *bo_node, *preferred_node;
	struct kfd_process *p;
	uint32_t gpuid;
	int r;

	p = container_of(prange->svms, struct kfd_process, svms);

	r = kfd_process_gpuid_from_node(p, node, &gpuid, gpuidx);
	if (r < 0) {
		pr_debug("failed to get gpuid from kgd\n");
		return -1;
	}

	if (node->adev->flags & AMD_IS_APU)
		return 0;

	if (prange->preferred_loc == gpuid ||
	    prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) {
		return prange->preferred_loc;
	} else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
		preferred_node = svm_range_get_node_by_id(prange, prange->preferred_loc);
		if (preferred_node && svm_nodes_in_same_hive(node, preferred_node))
			return prange->preferred_loc;
		/* fall through */
	}

	if (test_bit(*gpuidx, prange->bitmap_access))
		return gpuid;

	if (test_bit(*gpuidx, prange->bitmap_aip)) {
		if (!prange->actual_loc)
			return 0;

		bo_node = svm_range_get_node_by_id(prange, prange->actual_loc);
		if (bo_node && svm_nodes_in_same_hive(node, bo_node))
			return prange->actual_loc;
		else
			return 0;
	}

	return -1;
}

static int
svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr,
			       unsigned long *start, unsigned long *last,
			       bool *is_heap_stack)
{
	struct vm_area_struct *vma;
	struct interval_tree_node *node;
	struct rb_node *rb_node;
	unsigned long start_limit, end_limit;

	vma = vma_lookup(p->mm, addr << PAGE_SHIFT);
	if (!vma) {
		pr_debug("VMA does not exist in address [0x%llx]\n", addr);
		return -EFAULT;
	}

	*is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma);

	start_limit = max(vma->vm_start >> PAGE_SHIFT,
		      (unsigned long)ALIGN_DOWN(addr, 2UL << 8));
	end_limit = min(vma->vm_end >> PAGE_SHIFT,
		    (unsigned long)ALIGN(addr + 1, 2UL << 8));
	/* First range that starts after the fault address */
	node = interval_tree_iter_first(&p->svms.objects, addr + 1, ULONG_MAX);
	if (node) {
		end_limit = min(end_limit, node->start);
		/* Last range that ends before the fault address */
		rb_node = rb_prev(&node->rb);
	} else {
		/* Last range must end before addr because
		 * there was no range after addr
		 */
		rb_node = rb_last(&p->svms.objects.rb_root);
	}
	if (rb_node) {
		node = container_of(rb_node, struct interval_tree_node, rb);
		if (node->last >= addr) {
			WARN(1, "Overlap with prev node and page fault addr\n");
			return -EFAULT;
		}
		start_limit = max(start_limit, node->last + 1);
	}

	*start = start_limit;
	*last = end_limit - 1;

	pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n",
		 vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT,
		 *start, *last, *is_heap_stack);

	return 0;
}
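
/*
 * Worked example (illustrative numbers, in pages): 2UL << 8 is 512 pages,
 * i.e. a 2 MiB window with 4 KiB pages. For a fault at addr 0x12345 inside a
 * large VMA with no neighbouring svm ranges, start_limit becomes
 * ALIGN_DOWN(0x12345, 0x200) = 0x12200 and end_limit becomes
 * ALIGN(0x12346, 0x200) = 0x12400, so the unregistered range covers
 * [0x12200 0x123ff]. Neighbouring VMAs or svm ranges only shrink that window.
 */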

static int
svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last,
			   uint64_t *bo_s, uint64_t *bo_l)
{
	struct amdgpu_bo_va_mapping *mapping;
	struct interval_tree_node *node;
	struct amdgpu_bo *bo = NULL;
	unsigned long userptr;
	uint32_t i;
	int r;

	for (i = 0; i < p->n_pdds; i++) {
		struct amdgpu_vm *vm;

		if (!p->pdds[i]->drm_priv)
			continue;

		vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
		r = amdgpu_bo_reserve(vm->root.bo, false);
		if (r)
			return r;

		/* Check userptr by searching entire vm->va interval tree */
		node = interval_tree_iter_first(&vm->va, 0, ~0ULL);
		while (node) {
			mapping = container_of((struct rb_node *)node,
					       struct amdgpu_bo_va_mapping, rb);
			bo = mapping->bo_va->base.bo;

			if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm,
							 start << PAGE_SHIFT,
							 last << PAGE_SHIFT,
							 &userptr)) {
				node = interval_tree_iter_next(node, 0, ~0ULL);
				continue;
			}

			pr_debug("[0x%llx 0x%llx] already userptr mapped\n",
				 start, last);
			if (bo_s && bo_l) {
				*bo_s = userptr >> PAGE_SHIFT;
				*bo_l = *bo_s + bo->tbo.ttm->num_pages - 1;
			}
			amdgpu_bo_unreserve(vm->root.bo);
			return -EADDRINUSE;
		}
		amdgpu_bo_unreserve(vm->root.bo);
	}
	return 0;
}

static struct
svm_range *svm_range_create_unregistered_range(struct kfd_node *node,
						struct kfd_process *p,
						struct mm_struct *mm,
						int64_t addr)
{
	struct svm_range *prange = NULL;
	unsigned long start, last;
	uint32_t gpuid, gpuidx;
	bool is_heap_stack;
	uint64_t bo_s = 0;
	uint64_t bo_l = 0;
	int r;

	if (svm_range_get_range_boundaries(p, addr, &start, &last,
					   &is_heap_stack))
		return NULL;

	r = svm_range_check_vm(p, start, last, &bo_s, &bo_l);
	if (r != -EADDRINUSE)
		r = svm_range_check_vm_userptr(p, start, last, &bo_s, &bo_l);

	if (r == -EADDRINUSE) {
		if (addr >= bo_s && addr <= bo_l)
			return NULL;

		/* Create one page svm range if 2MB range overlapping */
		start = addr;
		last = addr;
	}

	prange = svm_range_new(&p->svms, start, last, true);
	if (!prange) {
		pr_debug("Failed to create prange in address [0x%llx]\n", addr);
		return NULL;
	}
	if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) {
		pr_debug("failed to get gpuid from kgd\n");
		svm_range_free(prange, true);
		return NULL;
	}

	if (is_heap_stack)
		prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM;

	svm_range_add_to_svms(prange);
	svm_range_add_notifier_locked(mm, prange);

	return prange;
}

/* svm_range_skip_recover - decide if prange can be recovered
 * @prange: svm range structure
 *
 * The GPU vm retry fault handler skips recovering the range in these cases:
 * 1. prange is on deferred list to be removed after unmap, it is stale fault,
 *    deferred list work will drain the stale fault before free the prange.
 * 2. prange is on deferred list to add interval notifier after split, or
 * 3. prange is child range, it is split from parent prange, recover later
 *    after interval notifier is added.
 *
 * Return: true to skip recover, false to recover
 */
static bool svm_range_skip_recover(struct svm_range *prange)
{
	struct svm_range_list *svms = prange->svms;

	spin_lock(&svms->deferred_list_lock);
	if (list_empty(&prange->deferred_list) &&
	    list_empty(&prange->child_list)) {
		spin_unlock(&svms->deferred_list_lock);
		return false;
	}
	spin_unlock(&svms->deferred_list_lock);

	if (prange->work_item.op == SVM_OP_UNMAP_RANGE) {
		pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n",
			 svms, prange, prange->start, prange->last);
		return true;
	}
	if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP ||
	    prange->work_item.op == SVM_OP_ADD_RANGE) {
		pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n",
			 svms, prange, prange->start, prange->last);
		return true;
	}
	return false;
}

static void
svm_range_count_fault(struct kfd_node *node, struct kfd_process *p,
		      int32_t gpuidx)
{
	struct kfd_process_device *pdd;

	/* fault is on different page of same range
	 * or fault is skipped to recover later
	 * or fault is on invalid virtual address
	 */
	if (gpuidx == MAX_GPU_INSTANCE) {
		uint32_t gpuid;
		int r;

		r = kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx);
		if (r < 0)
			return;
	}

	/* fault is recovered
	 * or fault cannot recover because GPU no access on the range
	 */
	pdd = kfd_process_device_from_gpuidx(p, gpuidx);
	if (pdd)
		WRITE_ONCE(pdd->faults, pdd->faults + 1);
}

static bool
svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
{
	unsigned long requested = VM_READ;

	if (write_fault)
		requested |= VM_WRITE;

	pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
		vma->vm_flags);
	return (vma->vm_flags & requested) == requested;
}

int
svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
			uint32_t vmid, uint32_t node_id,
			uint64_t addr, bool write_fault)
{
	unsigned long start, last, size;
	struct mm_struct *mm = NULL;
	struct svm_range_list *svms;
	struct svm_range *prange;
	struct kfd_process *p;
	ktime_t timestamp = ktime_get_boottime();
	struct kfd_node *node;
	int32_t best_loc;
	int32_t gpuidx = MAX_GPU_INSTANCE;
	bool write_locked = false;
	struct vm_area_struct *vma;
	bool migration = false;
	int r = 0;

	if (!KFD_IS_SVM_API_SUPPORTED(adev)) {
		pr_debug("device does not support SVM\n");
		return -EFAULT;
	}

	p = kfd_lookup_process_by_pasid(pasid);
	if (!p) {
		pr_debug("kfd process not founded pasid 0x%x\n", pasid);
		return 0;
	}
	svms = &p->svms;

	pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);

	if (atomic_read(&svms->drain_pagefaults)) {
		pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
		r = 0;
		goto out;
	}

	if (!p->xnack_enabled) {
		pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
		r = -EFAULT;
		goto out;
	}

	/* p->lead_thread is available as kfd_process_wq_release flushes the work
	 * before releasing task ref.
	 */
	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_debug("svms 0x%p failed to get mm\n", svms);
		r = 0;
		goto out;
	}

	node = kfd_node_by_irq_ids(adev, node_id, vmid);
	if (!node) {
		pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
			 vmid);
		r = -EFAULT;
		goto out;
	}
	mmap_read_lock(mm);
retry_write_locked:
	mutex_lock(&svms->lock);
	prange = svm_range_from_addr(svms, addr, NULL);
	if (!prange) {
		pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
			 svms, addr);
		if (!write_locked) {
			/* Need the write lock to create new range with MMU notifier.
			 * Also flush pending deferred work to make sure the interval
			 * tree is up to date before we add a new range
			 */
			mutex_unlock(&svms->lock);
			mmap_read_unlock(mm);
			mmap_write_lock(mm);
			write_locked = true;
			goto retry_write_locked;
		}
		prange = svm_range_create_unregistered_range(node, p, mm, addr);
		if (!prange) {
			pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n",
				 svms, addr);
			mmap_write_downgrade(mm);
			r = -EFAULT;
			goto out_unlock_svms;
		}
	}
	if (write_locked)
		mmap_write_downgrade(mm);

	mutex_lock(&prange->migrate_mutex);

	if (svm_range_skip_recover(prange)) {
		amdgpu_gmc_filter_faults_remove(node->adev, addr, pasid);
		r = 0;
		goto out_unlock_range;
	}

	/* skip duplicate vm fault on different pages of same range */
	if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
				AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
		pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
			 svms, prange->start, prange->last);
		r = 0;
		goto out_unlock_range;
	}

	/* __do_munmap removed the VMA; return success as we are handling a
	 * stale retry fault.
	 */
	vma = vma_lookup(mm, addr << PAGE_SHIFT);
	if (!vma) {
		pr_debug("address 0x%llx VMA is removed\n", addr);
		r = 0;
		goto out_unlock_range;
	}

	if (!svm_fault_allowed(vma, write_fault)) {
		pr_debug("fault addr 0x%llx no %s permission\n", addr,
			write_fault ? "write" : "read");
		r = -EPERM;
		goto out_unlock_range;
	}

	best_loc = svm_range_best_restore_location(prange, node, &gpuidx);
	if (best_loc == -1) {
		pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
			 svms, prange->start, prange->last);
		r = -EACCES;
		goto out_unlock_range;
	}

	pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n",
		 svms, prange->start, prange->last, best_loc,
		 prange->actual_loc);

	kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr,
				       write_fault, timestamp);

	/* Align migration range start and size to granularity size */
	size = 1UL << prange->granularity;
	start = max_t(unsigned long, ALIGN_DOWN(addr, size), prange->start);
	last = min_t(unsigned long, ALIGN(addr + 1, size) - 1, prange->last);
	if (prange->actual_loc != 0 || best_loc != 0) {
		migration = true;

		if (best_loc) {
			r = svm_migrate_to_vram(prange, best_loc, start, last,
					mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU);
			if (r) {
				pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n",
					 r, addr);
				/* Fallback to system memory if migration to
				 * VRAM failed
				 */
				if (prange->actual_loc && prange->actual_loc != best_loc)
					r = svm_migrate_vram_to_ram(prange, mm, start, last,
						KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, NULL);
				else
					r = 0;
			}
		} else {
			r = svm_migrate_vram_to_ram(prange, mm, start, last,
					KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, NULL);
		}
		if (r) {
			pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n",
				 r, svms, start, last);
			goto out_unlock_range;
		}
	}

	r = svm_range_validate_and_map(mm, start, last, prange, gpuidx, false,
				       false, false);
	if (r)
		pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
			 r, svms, start, last);

	kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr,
				     migration);

out_unlock_range:
	mutex_unlock(&prange->migrate_mutex);
out_unlock_svms:
	mutex_unlock(&svms->lock);
	mmap_read_unlock(mm);

	svm_range_count_fault(node, p, gpuidx);

	mmput(mm);
out:
	kfd_unref_process(p);

	if (r == -EAGAIN) {
		pr_debug("recover vm fault later\n");
		amdgpu_gmc_filter_faults_remove(node->adev, addr, pasid);
		r = 0;
	}
	return r;
}
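
/*
 * Summary of the fault path above (descriptive comment only): look up or
 * create the prange for the faulting address, skip stale faults on ranges
 * queued for removal, drop duplicate faults on a range restored within
 * AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING, check the VMA and access permission,
 * pick the best restore location, migrate a granularity-aligned window if
 * needed, then validate and map that window to the faulting GPU.
 */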

int
svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled)
{
	struct svm_range *prange, *pchild;
	uint64_t reserved_size = 0;
	uint64_t size;
	int r = 0;

	pr_debug("switching xnack from %d to %d\n", p->xnack_enabled, xnack_enabled);

	mutex_lock(&p->svms.lock);

	list_for_each_entry(prange, &p->svms.list, list) {
		svm_range_lock(prange);
		list_for_each_entry(pchild, &prange->child_list, child_list) {
			size = (pchild->last - pchild->start + 1) << PAGE_SHIFT;
			if (xnack_enabled) {
				amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
			} else {
				r = amdgpu_amdkfd_reserve_mem_limit(NULL, size,
					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
				if (r)
					goto out_unlock;
				reserved_size += size;
			}
		}

		size = (prange->last - prange->start + 1) << PAGE_SHIFT;
		if (xnack_enabled) {
			amdgpu_amdkfd_unreserve_mem_limit(NULL, size,
					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
		} else {
			r = amdgpu_amdkfd_reserve_mem_limit(NULL, size,
					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
			if (r)
				goto out_unlock;
			reserved_size += size;
		}
out_unlock:
		svm_range_unlock(prange);
		if (r)
			break;
	}

	if (r)
		amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size,
					KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, 0);
	else
		/* Change xnack mode must be inside svms lock, to avoid race with
		 * svm_range_deferred_list_work unreserve memory in parallel.
		 */
		p->xnack_enabled = xnack_enabled;

	mutex_unlock(&p->svms.lock);
	return r;
}
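
/*
 * Accounting example for svm_range_switch_xnack_reserve_mem (illustrative
 * numbers): with XNACK off every SVM range is accounted like a userptr
 * allocation, so a range of 0x200 pages reserves (0x200 << PAGE_SHIFT) =
 * 2 MiB against the KFD_IOC_ALLOC_MEM_FLAGS_USERPTR limit; switching XNACK
 * back on releases the same amount. If any reservation fails, the partial
 * total tracked in reserved_size is handed back before returning the error.
 */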

void svm_range_list_fini(struct kfd_process *p)
{
	struct svm_range *prange;
	struct svm_range *next;

	pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);

	cancel_delayed_work_sync(&p->svms.restore_work);

	/* Ensure list work is finished before process is destroyed */
	flush_work(&p->svms.deferred_list_work);

	/*
	 * Ensure no retry fault comes in afterwards, as page fault handler will
	 * not find kfd process and take mm lock to recover fault.
	 */
	atomic_inc(&p->svms.drain_pagefaults);
	svm_range_drain_retry_fault(&p->svms);

	list_for_each_entry_safe(prange, next, &p->svms.list, list) {
		svm_range_unlink(prange);
		svm_range_remove_notifier(prange);
		svm_range_free(prange, true);
	}

	mutex_destroy(&p->svms.lock);

	pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
}

int svm_range_list_init(struct kfd_process *p)
{
	struct svm_range_list *svms = &p->svms;
	int i;

	svms->objects = RB_ROOT_CACHED;
	mutex_init(&svms->lock);
	INIT_LIST_HEAD(&svms->list);
	atomic_set(&svms->evicted_ranges, 0);
	atomic_set(&svms->drain_pagefaults, 0);
	INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
	INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
	INIT_LIST_HEAD(&svms->deferred_range_list);
	INIT_LIST_HEAD(&svms->criu_svm_metadata_list);
	spin_lock_init(&svms->deferred_list_lock);

	for (i = 0; i < p->n_pdds; i++)
		if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev->adev))
			bitmap_set(svms->bitmap_supported, i, 1);

	return 0;
}

/**
 * svm_range_check_vm - check if virtual address range mapped already
 * @p: current kfd_process
 * @start: range start address, in pages
 * @last: range last address, in pages
 * @bo_s: mapping start address in pages if address range already mapped
 * @bo_l: mapping last address in pages if address range already mapped
 *
 * The purpose is to avoid virtual address ranges already allocated by
 * kfd_ioctl_alloc_memory_of_gpu ioctl.
 * It looks for each pdd in the kfd_process.
 *
 * Context: Process context
 *
 * Return 0 - OK, if the range is not mapped.
 * Otherwise error code:
 * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu
 * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by
 * a signal. Release all buffer reservations and return to user-space.
 */
static int
svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last,
		   uint64_t *bo_s, uint64_t *bo_l)
{
	struct amdgpu_bo_va_mapping *mapping;
	struct interval_tree_node *node;
	uint32_t i;
	int r;

	for (i = 0; i < p->n_pdds; i++) {
		struct amdgpu_vm *vm;

		if (!p->pdds[i]->drm_priv)
			continue;

		vm = drm_priv_to_vm(p->pdds[i]->drm_priv);
		r = amdgpu_bo_reserve(vm->root.bo, false);
		if (r)
			return r;

		node = interval_tree_iter_first(&vm->va, start, last);
		if (node) {
			pr_debug("range [0x%llx 0x%llx] already TTM mapped\n",
				 start, last);
			mapping = container_of((struct rb_node *)node,
					       struct amdgpu_bo_va_mapping, rb);
			if (bo_s && bo_l) {
				*bo_s = mapping->start;
				*bo_l = mapping->last;
			}
			amdgpu_bo_unreserve(vm->root.bo);
			return -EADDRINUSE;
		}
		amdgpu_bo_unreserve(vm->root.bo);
	}

	return 0;
}

/**
 * svm_range_is_valid - check if virtual address range is valid
 * @p: current kfd_process
 * @start: range start address, in pages
 * @size: range size, in pages
 *
 * Valid virtual address range means it belongs to one or more VMAs
 *
 * Context: Process context
 *
 * Return:
 *  0 - OK, otherwise error code
 */
static int
svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size)
{
	const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
	struct vm_area_struct *vma;
	unsigned long end;
	unsigned long start_unchg = start;

	start <<= PAGE_SHIFT;
	end = start + (size << PAGE_SHIFT);
	do {
		vma = vma_lookup(p->mm, start);
		if (!vma || (vma->vm_flags & device_vma))
			return -EFAULT;
		start = min(end, vma->vm_end);
	} while (start < end);

	return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL,
				  NULL);
}

/**
 * svm_range_best_prefetch_location - decide the best prefetch location
 * @prange: svm range structure
 *
 * For xnack off:
 * If range map to single GPU, the best prefetch location is prefetch_loc, which
 * can be CPU or GPU.
 *
 * If range is ACCESS or ACCESS_IN_PLACE by mGPUs, the best prefetch location
 * is the prefetch_loc GPU only if the mGPUs are connected in the same XGMI
 * hive; otherwise the best prefetch location is always CPU, because a GPU
 * cannot have a coherent mapping of another GPU's VRAM even with a large-BAR
 * PCIe connection.
 *
 * For xnack on:
 * If range is not ACCESS_IN_PLACE by mGPUs, the best prefetch location is
 * prefetch_loc, other GPU access will generate vm fault and trigger migration.
 *
 * If range is ACCESS_IN_PLACE by mGPUs, only if mGPU connection on XGMI same
 * hive, the best prefetch location is prefetch_loc GPU, otherwise the best
 * prefetch location is always CPU.
 *
 * Context: Process context
 *
 * Return:
 * 0 for CPU or GPU id
 */
static uint32_t
svm_range_best_prefetch_location(struct svm_range *prange)
{
	DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE);
	uint32_t best_loc = prange->prefetch_loc;
	struct kfd_process_device *pdd;
	struct kfd_node *bo_node;
	struct kfd_process *p;
	uint32_t gpuidx;

	p = container_of(prange->svms, struct kfd_process, svms);

	if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED)
		goto out;

	bo_node = svm_range_get_node_by_id(prange, best_loc);
	if (!bo_node) {
		WARN_ONCE(1, "failed to get valid kfd node at id%x\n", best_loc);
		best_loc = 0;
		goto out;
	}

	if (bo_node->adev->flags & AMD_IS_APU) {
		best_loc = 0;
		goto out;
	}

	if (p->xnack_enabled)
		bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
	else
		bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
			  MAX_GPU_INSTANCE);

	for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) {
		pdd = kfd_process_device_from_gpuidx(p, gpuidx);
		if (!pdd) {
			pr_debug("failed to get device by idx 0x%x\n", gpuidx);
			continue;
		}

		if (pdd->dev->adev == bo_node->adev)
			continue;

		if (!svm_nodes_in_same_hive(pdd->dev, bo_node)) {
			best_loc = 0;
			break;
		}
	}

out:
	pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
		 p->xnack_enabled, &p->svms, prange->start, prange->last,
		 best_loc);

	return best_loc;
}

/* svm_range_trigger_migration - start page migration if prefetch loc changed
 * @mm: current process mm_struct
 * @prange: svm range structure
 * @migrated: output, true if migration is triggered
 *
 * If range prefetch_loc is GPU, actual loc is cpu 0, then migrate the range
 * from ram to vram.
 * If range prefetch_loc is cpu 0, actual loc is GPU, then migrate the range
 * from vram to ram.
 *
 * If GPU vm fault retry is not enabled, migration interact with MMU notifier
 * and restore work:
 * 1. migrate_vma_setup invalidate pages, MMU notifier callback svm_range_evict
 *    stops all queues, schedule restore work
 * 2. svm_range_restore_work wait for migration is done by
 *    a. svm_range_validate_vram takes prange->migrate_mutex
 *    b. svm_range_validate_ram HMM get pages wait for CPU fault handle returns
 * 3. restore work update mappings of GPU, resume all queues.
 *
 * Context: Process context
 *
 * Return:
 * 0 - OK, otherwise - error code of migration
 */
static int
svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange,
			    bool *migrated)
{
	uint32_t best_loc;
	int r = 0;

	*migrated = false;
	best_loc = svm_range_best_prefetch_location(prange);

	/* When best_loc is a GPU node and the same as prange->actual_loc, we
	 * still need to do the migration, because prange->actual_loc != 0 does
	 * not mean all pages in prange are in VRAM. hmm migrate will pick up
	 * the right pages during migration.
	 */
	if ((best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED) ||
	    (best_loc == 0 && prange->actual_loc == 0))
		return 0;

	if (!best_loc) {
		r = svm_migrate_vram_to_ram(prange, mm, prange->start, prange->last,
					KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
		*migrated = !r;
		return r;
	}

	r = svm_migrate_to_vram(prange, best_loc, prange->start, prange->last,
				mm, KFD_MIGRATE_TRIGGER_PREFETCH);
	*migrated = !r;

	return 0;
}

int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence)
{
	/* Dereferencing fence->svm_bo is safe here because the fence hasn't
	 * signaled yet and we're under the protection of the fence->lock.
	 * After the fence is signaled in svm_range_bo_release, we cannot get
	 * here any more.
	 *
	 * Reference is dropped in svm_range_evict_svm_bo_worker.
	 */
	if (svm_bo_ref_unless_zero(fence->svm_bo)) {
		WRITE_ONCE(fence->svm_bo->evicting, 1);
		schedule_work(&fence->svm_bo->eviction_work);
	}

	return 0;
}

static void svm_range_evict_svm_bo_worker(struct work_struct *work)
{
	struct svm_range_bo *svm_bo;
	struct mm_struct *mm;
	int r = 0;

	svm_bo = container_of(work, struct svm_range_bo, eviction_work);

	if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
		mm = svm_bo->eviction_fence->mm;
	} else {
		svm_range_bo_unref(svm_bo);
		return;
	}

	mmap_read_lock(mm);
	spin_lock(&svm_bo->list_lock);
	while (!list_empty(&svm_bo->range_list) && !r) {
		struct svm_range *prange =
				list_first_entry(&svm_bo->range_list,
						struct svm_range, svm_bo_list);
		int retries = 3;

		list_del_init(&prange->svm_bo_list);
		spin_unlock(&svm_bo->list_lock);

		pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
			 prange->start, prange->last);

		mutex_lock(&prange->migrate_mutex);
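		/* svm_migrate_vram_to_ram can legitimately leave pages behind,
		 * e.g. when it races with a concurrent migration or fault, so
		 * retry a few times until no vram pages remain in the range.
		 */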
		do {
			/* migrate all vram pages in this prange to sys ram;
			 * after that prange->actual_loc should be zero
			 */
			r = svm_migrate_vram_to_ram(prange, mm,
					prange->start, prange->last,
					KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL);
		} while (!r && prange->actual_loc && --retries);

		if (!r && prange->actual_loc)
			pr_info_once("Migration failed during eviction");

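		/* No vram pages are left in this range, so it no longer needs
		 * to be attached to this svm_bo.
		 */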
		if (!prange->actual_loc) {
			mutex_lock(&prange->lock);
			prange->svm_bo = NULL;
			mutex_unlock(&prange->lock);
		}
		mutex_unlock(&prange->migrate_mutex);

		spin_lock(&svm_bo->list_lock);
	}
	spin_unlock(&svm_bo->list_lock);
	mmap_read_unlock(mm);
	mmput(mm);

	dma_fence_signal(&svm_bo->eviction_fence->base);

	/* This is the last reference to svm_bo, after svm_range_vram_node_free
	 * has been called in svm_migrate_vram_to_ram
	 */
	WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
	svm_range_bo_unref(svm_bo);
}

static int
svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
		   uint64_t start, uint64_t size, uint32_t nattr,
		   struct kfd_ioctl_svm_attribute *attrs)
{
	struct amdkfd_process_info *process_info = p->kgd_process_info;
	struct list_head update_list;
	struct list_head insert_list;
	struct list_head remove_list;
	struct list_head remap_list;
	struct svm_range_list *svms;
	struct svm_range *prange;
	struct svm_range *next;
	bool update_mapping = false;
	bool flush_tlb;
	int r, ret = 0;

	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
		 p->pasid, &p->svms, start, start + size - 1, size);

	r = svm_range_check_attr(p, nattr, attrs);
	if (r)
		return r;

	svms = &p->svms;

	mutex_lock(&process_info->lock);

	svm_range_list_lock_and_flush_work(svms, mm);

	r = svm_range_is_valid(p, start, size);
	if (r) {
		pr_debug("invalid range r=%d\n", r);
		mmap_write_unlock(mm);
		goto out;
	}

	mutex_lock(&svms->lock);

	/* Add new range and split existing ranges as needed */
	r = svm_range_add(p, start, size, nattr, attrs, &update_list,
			  &insert_list, &remove_list, &remap_list);
	if (r) {
		mutex_unlock(&svms->lock);
		mmap_write_unlock(mm);
		goto out;
	}
	/* Apply changes as a transaction */
	list_for_each_entry_safe(prange, next, &insert_list, list) {
		svm_range_add_to_svms(prange);
		svm_range_add_notifier_locked(mm, prange);
	}
	list_for_each_entry(prange, &update_list, update_list) {
		svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping);
		/* TODO: unmap ranges from GPU that lost access */
	}
	list_for_each_entry_safe(prange, next, &remove_list, update_list) {
		pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n",
			 prange->svms, prange, prange->start,
			 prange->last);
		svm_range_unlink(prange);
		svm_range_remove_notifier(prange);
		svm_range_free(prange, false);
	}

	mmap_write_downgrade(mm);
	/* Trigger migrations and revalidate and map to GPUs as needed. If
	 * this fails we may be left with partially completed actions. There
	 * is no clean way of rolling back to the previous state in such a
	 * case because the rollback wouldn't be guaranteed to work either.
	 */
	list_for_each_entry(prange, &update_list, update_list) {
		bool migrated;

		mutex_lock(&prange->migrate_mutex);

		r = svm_range_trigger_migration(mm, prange, &migrated);
		if (r)
			goto out_unlock_range;

		if (migrated && (!p->xnack_enabled ||
		    (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) &&
		    prange->mapped_to_gpu) {
			pr_debug("restore_work will update mappings of GPUs\n");
			mutex_unlock(&prange->migrate_mutex);
			continue;
		}

		if (!migrated && !update_mapping) {
			mutex_unlock(&prange->migrate_mutex);
			continue;
		}

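		/* A TLB flush is only needed when the attributes of a range
		 * that is already mapped to GPUs are updated in place, i.e.
		 * without a migration having invalidated the old mapping.
		 */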
		flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu;

		r = svm_range_validate_and_map(mm, prange->start, prange->last, prange,
					       MAX_GPU_INSTANCE, true, true, flush_tlb);
		if (r)
			pr_debug("failed %d to map svm range\n", r);

out_unlock_range:
		mutex_unlock(&prange->migrate_mutex);
		if (r)
			ret = r;
	}

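	/* Ranges on the remap list (typically split off existing mapped
	 * ranges) are validated and mapped again so the GPU page tables
	 * match the new range boundaries.
	 */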
	list_for_each_entry(prange, &remap_list, update_list) {
		pr_debug("Remapping prange 0x%p [0x%lx 0x%lx]\n",
			 prange, prange->start, prange->last);
		mutex_lock(&prange->migrate_mutex);
		r = svm_range_validate_and_map(mm, prange->start, prange->last, prange,
					       MAX_GPU_INSTANCE, true, true, prange->mapped_to_gpu);
		if (r)
			pr_debug("failed %d on remap svm range\n", r);
		mutex_unlock(&prange->migrate_mutex);
		if (r)
			ret = r;
	}

	dynamic_svm_range_dump(svms);

	mutex_unlock(&svms->lock);
	mmap_read_unlock(mm);
out:
	mutex_unlock(&process_info->lock);

	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
		 &p->svms, start, start + size - 1, r);

	return ret ? ret : r;
}

static int
svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm,
		   uint64_t start, uint64_t size, uint32_t nattr,
		   struct kfd_ioctl_svm_attribute *attrs)
{
	DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
	DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
	bool get_preferred_loc = false;
	bool get_prefetch_loc = false;
	bool get_granularity = false;
	bool get_accessible = false;
	bool get_flags = false;
	uint64_t last = start + size - 1UL;
	uint8_t granularity = 0xff;
	struct interval_tree_node *node;
	struct svm_range_list *svms;
	struct svm_range *prange;
	uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
	uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
	uint32_t flags_and = 0xffffffff;
	uint32_t flags_or = 0;
	int gpuidx;
	uint32_t i;
	int r = 0;

	pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start,
		 start + size - 1, nattr);

	/* Flush pending deferred work to avoid racing with deferred actions from
	 * previous memory map changes (e.g. munmap). Concurrent memory map changes
	 * can still race with get_attr because we don't hold the mmap lock. But that
	 * would be a race condition in the application anyway, and undefined
	 * behaviour is acceptable in that case.
	 */
	flush_work(&p->svms.deferred_list_work);

	mmap_read_lock(mm);
	r = svm_range_is_valid(p, start, size);
	mmap_read_unlock(mm);
	if (r) {
		pr_debug("invalid range r=%d\n", r);
		return r;
	}

	for (i = 0; i < nattr; i++) {
		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			get_preferred_loc = true;
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			get_prefetch_loc = true;
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
			get_accessible = true;
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			get_flags = true;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			get_granularity = true;
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE:
		case KFD_IOCTL_SVM_ATTR_NO_ACCESS:
			fallthrough;
		default:
			pr_debug("get invalid attr type 0x%x\n", attrs[i].type);
			return -EINVAL;
		}
	}

	svms = &p->svms;

	mutex_lock(&svms->lock);

	node = interval_tree_iter_first(&svms->objects, start, last);
	if (!node) {
		pr_debug("range attrs not found return default values\n");
		svm_range_set_default_attributes(&location, &prefetch_loc,
						 &granularity, &flags_and);
		flags_or = flags_and;
		if (p->xnack_enabled)
			bitmap_copy(bitmap_access, svms->bitmap_supported,
				    MAX_GPU_INSTANCE);
		else
			bitmap_zero(bitmap_access, MAX_GPU_INSTANCE);
		bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE);
		goto fill_values;
	}
	bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE);
	bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE);

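	/* Walk all ranges overlapping [start, last] and aggregate their
	 * attributes: a preferred/prefetch location is only reported if it is
	 * identical across the ranges, accessibility bitmaps are AND-ed,
	 * flags are collected as both AND and OR masks, and the smallest
	 * granularity wins.
	 */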
	while (node) {
		struct interval_tree_node *next;

		prange = container_of(node, struct svm_range, it_node);
		next = interval_tree_iter_next(node, start, last);

		if (get_preferred_loc) {
			if (prange->preferred_loc ==
					KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
			    (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
			     location != prange->preferred_loc)) {
				location = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
				get_preferred_loc = false;
			} else {
				location = prange->preferred_loc;
			}
		}
		if (get_prefetch_loc) {
			if (prange->prefetch_loc ==
					KFD_IOCTL_SVM_LOCATION_UNDEFINED ||
			    (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED &&
			     prefetch_loc != prange->prefetch_loc)) {
				prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED;
				get_prefetch_loc = false;
			} else {
				prefetch_loc = prange->prefetch_loc;
			}
		}
		if (get_accessible) {
			bitmap_and(bitmap_access, bitmap_access,
				   prange->bitmap_access, MAX_GPU_INSTANCE);
			bitmap_and(bitmap_aip, bitmap_aip,
				   prange->bitmap_aip, MAX_GPU_INSTANCE);
		}
		if (get_flags) {
			flags_and &= prange->flags;
			flags_or |= prange->flags;
		}

		if (get_granularity && prange->granularity < granularity)
			granularity = prange->granularity;

		node = next;
	}
fill_values:
	mutex_unlock(&svms->lock);

	for (i = 0; i < nattr; i++) {
		switch (attrs[i].type) {
		case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC:
			attrs[i].value = location;
			break;
		case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
			attrs[i].value = prefetch_loc;
			break;
		case KFD_IOCTL_SVM_ATTR_ACCESS:
			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (gpuidx < 0) {
				pr_debug("invalid gpuid %x\n", attrs[i].value);
				return -EINVAL;
			}
			if (test_bit(gpuidx, bitmap_access))
				attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS;
			else if (test_bit(gpuidx, bitmap_aip))
				attrs[i].type =
					KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE;
			else
				attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS;
			break;
		case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
			attrs[i].value = flags_and;
			break;
		case KFD_IOCTL_SVM_ATTR_CLR_FLAGS:
			attrs[i].value = ~flags_or;
			break;
		case KFD_IOCTL_SVM_ATTR_GRANULARITY:
			attrs[i].value = (uint32_t)granularity;
			break;
		}
	}

	return 0;
}

int kfd_criu_resume_svm(struct kfd_process *p)
{
	struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL;
	int nattr_common = 4, nattr_accessibility = 1;
	struct criu_svm_metadata *criu_svm_md = NULL;
	struct svm_range_list *svms = &p->svms;
	struct criu_svm_metadata *next = NULL;
	uint32_t set_flags = 0xffffffff;
	int i, j, num_attrs, ret = 0;
	uint64_t set_attr_size;
	struct mm_struct *mm;

	if (list_empty(&svms->criu_svm_metadata_list)) {
		pr_debug("No SVM data from CRIU restore stage 2\n");
		return ret;
	}

	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_err("failed to get mm for the target process\n");
		return -ESRCH;
	}

	num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);

	i = j = 0;
	list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) {
		pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n",
			 i, criu_svm_md->data.start_addr, criu_svm_md->data.size);

		for (j = 0; j < num_attrs; j++) {
			pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x\ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
				 i, j, criu_svm_md->data.attrs[j].type,
				 i, j, criu_svm_md->data.attrs[j].value);
			switch (criu_svm_md->data.attrs[j].type) {
			/* During Checkpoint operation, the query for
			 * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC attribute might
			 * return KFD_IOCTL_SVM_LOCATION_UNDEFINED if it was
			 * not used by the range which was checkpointed. Care
			 * must be taken not to restore with an invalid value,
			 * otherwise the gpuidx value will be invalid and
			 * set_attr would eventually fail, so just replace those
			 * with another dummy attribute such as
			 * KFD_IOCTL_SVM_ATTR_SET_FLAGS.
			 */
			case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC:
				if (criu_svm_md->data.attrs[j].value ==
				    KFD_IOCTL_SVM_LOCATION_UNDEFINED) {
					criu_svm_md->data.attrs[j].type =
						KFD_IOCTL_SVM_ATTR_SET_FLAGS;
					criu_svm_md->data.attrs[j].value = 0;
				}
				break;
			case KFD_IOCTL_SVM_ATTR_SET_FLAGS:
				set_flags = criu_svm_md->data.attrs[j].value;
				break;
			default:
				break;
			}
		}

		/* CLR_FLAGS is not available via get_attr during checkpoint but
		 * it needs to be inserted before restoring the ranges, so
		 * allocate extra space for it before calling set_attr
		 */
		set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
						(num_attrs + 1);
		set_attr_new = krealloc(set_attr, set_attr_size,
					    GFP_KERNEL);
		if (!set_attr_new) {
			ret = -ENOMEM;
			goto exit;
		}
		set_attr = set_attr_new;

		memcpy(set_attr, criu_svm_md->data.attrs, num_attrs *
					sizeof(struct kfd_ioctl_svm_attribute));
		set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS;
		set_attr[num_attrs].value = ~set_flags;

		ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr,
					 criu_svm_md->data.size, num_attrs + 1,
					 set_attr);
		if (ret) {
			pr_err("CRIU: failed to set range attributes\n");
			goto exit;
		}

		i++;
	}
exit:
	kfree(set_attr);
	list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) {
		pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
						criu_svm_md->data.start_addr);
		kfree(criu_svm_md);
	}

	mmput(mm);
	return ret;

}

int kfd_criu_restore_svm(struct kfd_process *p,
			 uint8_t __user *user_priv_ptr,
			 uint64_t *priv_data_offset,
			 uint64_t max_priv_data_size)
{
	uint64_t svm_priv_data_size, svm_object_md_size, svm_attrs_size;
	int nattr_common = 4, nattr_accessibility = 1;
	struct criu_svm_metadata *criu_svm_md = NULL;
	struct svm_range_list *svms = &p->svms;
	uint32_t num_devices;
	int ret = 0;

	num_devices = p->n_pdds;
	/* Handle one SVM range object at a time. The number of gpus is also
	 * assumed to be the same on the restore node; that check must be done
	 * earlier, while evaluating the topology.
	 */
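	/* This function only stages the checkpointed attributes; they are
	 * applied to the restored address space later by kfd_criu_resume_svm.
	 */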

	svm_attrs_size = sizeof(struct kfd_ioctl_svm_attribute) *
		(nattr_common + nattr_accessibility * num_devices);
	svm_object_md_size = sizeof(struct criu_svm_metadata) + svm_attrs_size;

	svm_priv_data_size = sizeof(struct kfd_criu_svm_range_priv_data) +
								svm_attrs_size;

	criu_svm_md = kzalloc(svm_object_md_size, GFP_KERNEL);
	if (!criu_svm_md) {
		pr_err("failed to allocate memory to store svm metadata\n");
		return -ENOMEM;
	}
	if (*priv_data_offset + svm_priv_data_size > max_priv_data_size) {
		ret = -EINVAL;
		goto exit;
	}

	ret = copy_from_user(&criu_svm_md->data, user_priv_ptr + *priv_data_offset,
			     svm_priv_data_size);
	if (ret) {
		ret = -EFAULT;
		goto exit;
	}
	*priv_data_offset += svm_priv_data_size;

	list_add_tail(&criu_svm_md->list, &svms->criu_svm_metadata_list);

	return 0;

exit:
	kfree(criu_svm_md);
	return ret;
}

int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
		       uint64_t *svm_priv_data_size)
{
	uint64_t total_size, accessibility_size, common_attr_size;
	int nattr_common = 4, nattr_accessibility = 1;
	int num_devices = p->n_pdds;
	struct svm_range_list *svms;
	struct svm_range *prange;
	uint32_t count = 0;

	*svm_priv_data_size = 0;

	svms = &p->svms;
	if (!svms)
		return -EINVAL;

	mutex_lock(&svms->lock);
	list_for_each_entry(prange, &svms->list, list) {
		pr_debug("prange: 0x%p start: 0x%lx\t npages: 0x%llx\t end: 0x%llx\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1);
		count++;
	}
	mutex_unlock(&svms->lock);

	*num_svm_ranges = count;
	/* Only the accessibility attributes need to be queried for all the gpus
	 * individually; the remaining ones span the entire process regardless
	 * of the various gpu nodes. Of the remaining attributes,
	 * KFD_IOCTL_SVM_ATTR_CLR_FLAGS need not be saved.
	 *
	 * KFD_IOCTL_SVM_ATTR_PREFERRED_LOC
	 * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC
	 * KFD_IOCTL_SVM_ATTR_SET_FLAGS
	 * KFD_IOCTL_SVM_ATTR_GRANULARITY
	 *
	 * ** ACCESSIBILITY ATTRIBUTES **
	 * (Considered as one, type is altered during query, value is gpuid)
	 * KFD_IOCTL_SVM_ATTR_ACCESS
	 * KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE
	 * KFD_IOCTL_SVM_ATTR_NO_ACCESS
	 */
	if (*num_svm_ranges > 0) {
		common_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
			nattr_common;
		accessibility_size = sizeof(struct kfd_ioctl_svm_attribute) *
			nattr_accessibility * num_devices;

		total_size = sizeof(struct kfd_criu_svm_range_priv_data) +
			common_attr_size + accessibility_size;

		*svm_priv_data_size = *num_svm_ranges * total_size;
	}

	pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges,
		 *svm_priv_data_size);
	return 0;
}

int kfd_criu_checkpoint_svm(struct kfd_process *p,
			    uint8_t __user *user_priv_data,
			    uint64_t *priv_data_offset)
{
	struct kfd_criu_svm_range_priv_data *svm_priv = NULL;
	struct kfd_ioctl_svm_attribute *query_attr = NULL;
	uint64_t svm_priv_data_size, query_attr_size = 0;
	int index, nattr_common = 4, ret = 0;
	struct svm_range_list *svms;
	int num_devices = p->n_pdds;
	struct svm_range *prange;
	struct mm_struct *mm;

	svms = &p->svms;
	if (!svms)
		return -EINVAL;

	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_err("failed to get mm for the target process\n");
		return -ESRCH;
	}

	query_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
				(nattr_common + num_devices);

	query_attr = kzalloc(query_attr_size, GFP_KERNEL);
	if (!query_attr) {
		ret = -ENOMEM;
		goto exit;
	}

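	/* Build one query template: the four process-wide attributes followed
	 * by one accessibility attribute per GPU whose value carries the user
	 * GPU id to query.
	 */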
	query_attr[0].type = KFD_IOCTL_SVM_ATTR_PREFERRED_LOC;
	query_attr[1].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC;
	query_attr[2].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS;
	query_attr[3].type = KFD_IOCTL_SVM_ATTR_GRANULARITY;

	for (index = 0; index < num_devices; index++) {
		struct kfd_process_device *pdd = p->pdds[index];

		query_attr[index + nattr_common].type =
			KFD_IOCTL_SVM_ATTR_ACCESS;
		query_attr[index + nattr_common].value = pdd->user_gpu_id;
	}

	svm_priv_data_size = sizeof(*svm_priv) + query_attr_size;

	svm_priv = kzalloc(svm_priv_data_size, GFP_KERNEL);
	if (!svm_priv) {
		ret = -ENOMEM;
		goto exit_query;
	}

	index = 0;
	list_for_each_entry(prange, &svms->list, list) {

		svm_priv->object_type = KFD_CRIU_OBJECT_TYPE_SVM_RANGE;
		svm_priv->start_addr = prange->start;
		svm_priv->size = prange->npages;
		memcpy(&svm_priv->attrs, query_attr, query_attr_size);
		pr_debug("CRIU: prange: 0x%p start: 0x%lx\t npages: 0x%llx end: 0x%llx\t size: 0x%llx\n",
			 prange, prange->start, prange->npages,
			 prange->start + prange->npages - 1,
			 prange->npages * PAGE_SIZE);

		ret = svm_range_get_attr(p, mm, svm_priv->start_addr,
					 svm_priv->size,
					 (nattr_common + num_devices),
					 svm_priv->attrs);
		if (ret) {
			pr_err("CRIU: failed to obtain range attributes\n");
			goto exit_priv;
		}

		if (copy_to_user(user_priv_data + *priv_data_offset, svm_priv,
				 svm_priv_data_size)) {
			pr_err("Failed to copy svm priv to user\n");
			ret = -EFAULT;
			goto exit_priv;
		}

		*priv_data_offset += svm_priv_data_size;

	}

exit_priv:
	kfree(svm_priv);
exit_query:
	kfree(query_attr);
exit:
	mmput(mm);
	return ret;
}

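/*
 * svm_ioctl - dispatch a KFD SVM ioctl operation
 *
 * Rough userspace usage sketch (a minimal example, assuming the uapi layout
 * of struct kfd_ioctl_svm_args in include/uapi/linux/kfd_ioctl.h):
 *
 *	struct kfd_ioctl_svm_args *args =
 *		calloc(1, sizeof(*args) + nattr * sizeof(args->attrs[0]));
 *	args->start_addr = (uintptr_t)addr;	// page aligned virtual address
 *	args->size = size;			// size in bytes, page aligned
 *	args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
 *	args->nattr = nattr;
 *	// fill in args->attrs[i].type and .value, then:
 *	ioctl(kfd_fd, AMDKFD_IOC_SVM, args);	// kfd_fd: open("/dev/kfd", O_RDWR)
 *
 * start and size are converted from bytes to page numbers below before the
 * set/get attribute helpers are called.
 */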
int
svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
	  uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs)
{
	struct mm_struct *mm = current->mm;
	int r;

	start >>= PAGE_SHIFT;
	size >>= PAGE_SHIFT;

	switch (op) {
	case KFD_IOCTL_SVM_OP_SET_ATTR:
		r = svm_range_set_attr(p, mm, start, size, nattrs, attrs);
		break;
	case KFD_IOCTL_SVM_OP_GET_ATTR:
		r = svm_range_get_attr(p, mm, start, size, nattrs, attrs);
		break;
	default:
		r = -EINVAL;
		break;
	}

	return r;
}