drm/nouveau/mmu: implement new vmm backend

This is the common code to support a rework of the VMM backends. It adds support for more than 2 levels of page table nesting, which is required to be able to support GP100's MMU layout. Sparse mappings (that don't cause MMU faults when accessed) are now supported, where the backend provides it. Dual-PT handling had to become more sophisticated to support sparse, but this also allows us to support an optimisation the MMU provides on GK104 and newer. Certain operations can now be combined into a single page tree walk to avoid some overhead, but also enables optimsations like skipping PTE unmap writes when the PT will be destroyed anyway. The old backend has been hacked up to forward requests onto the new backend, if present, so that it's possible to bisect between issues in the backend changes vs the upcoming frontend changes. Until the new frontend has been merged, new backends will leak BAR2 page tables on module unload. This is expected, and it's not worth the effort of hacking around this as it doesn't effect runtime. Signed-off-by: Ben Skeggs <bskeggs@redhat.com>

drm/nouveau/mmu: implement new vmm backend
This is the common code to support a rework of the VMM backends. It adds support for more than 2 levels of page table nesting, which is required to be able to support GP100's MMU layout. Sparse mappings (that don't cause MMU faults when accessed) are now supported, where the backend provides it. Dual-PT handling had to become more sophisticated to support sparse, but this also allows us to support an optimisation the MMU provides on GK104 and newer. Certain operations can now be combined into a single page tree walk to avoid some overhead, but also enables optimsations like skipping PTE unmap writes when the PT will be destroyed anyway. The old backend has been hacked up to forward requests onto the new backend, if present, so that it's possible to bisect between issues in the backend changes vs the upcoming frontend changes. Until the new frontend has been merged, new backends will leak BAR2 page tables on module unload. This is expected, and it's not worth the effort of hacking around this as it doesn't effect runtime. Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
eb813999 · Ben Skeggs · bda9e379 · eb813999 · eb813999 · eb813999
Commit eb813999 authored Nov 01, 2017 by Ben Skeggs
5 changed files
--- a/drivers/gpu/drm/nouveau/Kconfig
+++ b/drivers/gpu/drm/nouveau/Kconfig
@@ -56,6 +56,13 @@ config NOUVEAU_DEBUG_DEFAULT
 	help
 	  Selects the default debug level

+config NOUVEAU_DEBUG_MMU
+	bool "Enable additional MMU debugging"
+	depends on DRM_NOUVEAU
+	default n
+	help
+	  Say Y here if you want to enable verbose MMU debug output.
+
 config DRM_NOUVEAU_BACKLIGHT
 	bool "Support for backlight control"
 	depends on DRM_NOUVEAU

--- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h
+++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h
 #ifndef __NVKM_MMU_H__
 #define __NVKM_MMU_H__
 #include <core/subdev.h>
+#include <core/memory.h>
 #include <core/mm.h>
 struct nvkm_gpuobj;
 struct nvkm_mem;
@@ -11,6 +12,8 @@ struct nvkm_vm_pgt {
 };

 struct nvkm_vma {
+	struct nvkm_memory *memory;
+	struct nvkm_tags *tags;
 	struct nvkm_vm *vm;
 	struct nvkm_mm_node *node;
 	union {
@@ -24,6 +27,7 @@ struct nvkm_vm {
 	const struct nvkm_vmm_func *func;
 	struct nvkm_mmu *mmu;
 	const char *name;
+	u32 debug;
 	struct kref kref;
 	struct mutex mutex;

@@ -58,6 +62,25 @@ void nvkm_vm_map_at(struct nvkm_vma *, u64 offset, struct nvkm_mem *);
 void nvkm_vm_unmap(struct nvkm_vma *);
 void nvkm_vm_unmap_at(struct nvkm_vma *, u64 offset, u64 length);

+int nvkm_vmm_boot(struct nvkm_vmm *);
+
+struct nvkm_vmm_map {
+	struct nvkm_memory *memory;
+	u64 offset;
+
+	struct nvkm_mm_node *mem;
+	struct scatterlist *sgl;
+	dma_addr_t *dma;
+	u64 off;
+
+	const struct nvkm_vmm_page *page;
+
+	struct nvkm_tags *tags;
+	u64 next;
+	u64 type;
+	u64 ctag;
+};
+
 struct nvkm_mmu {
 	const struct nvkm_mmu_func *func;
 	struct nvkm_subdev subdev;

--- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c
@@ -213,6 +213,36 @@ nvkm_mmu_ptc_get(struct nvkm_mmu *mmu, u32 size, u32 align, bool zero)
 	return pt;
 }

+static void
+nvkm_vm_map_(const struct nvkm_vmm_page *page, struct nvkm_vma *vma, u64 delta,
+	     struct nvkm_mem *mem, nvkm_vmm_pte_func fn,
+	     struct nvkm_vmm_map *map)
+{
+	struct nvkm_vmm *vmm = vma->vm;
+	void *argv = NULL;
+	u32 argc = 0;
+	int ret;
+
+	map->memory = mem->memory;
+	map->page = page;
+
+	if (vmm->func->valid) {
+		ret = vmm->func->valid(vmm, argv, argc, map);
+		if (WARN_ON(ret))
+			return;
+	}
+
+	mutex_lock(&vmm->mutex);
+	nvkm_vmm_ptes_map(vmm, page, ((u64)vma->node->offset << 12) + delta,
+				      (u64)vma->node->length << 12, map, fn);
+	mutex_unlock(&vmm->mutex);
+
+	nvkm_memory_tags_put(vma->memory, vmm->mmu->subdev.device, &vma->tags);
+	nvkm_memory_unref(&vma->memory);
+	vma->memory = nvkm_memory_ref(map->memory);
+	vma->tags = map->tags;
+}
+
 void
 nvkm_mmu_ptc_dump(struct nvkm_mmu *mmu)
 {
@@ -251,6 +281,7 @@ nvkm_mmu_ptc_init(struct nvkm_mmu *mmu)
 void
 nvkm_vm_map_at(struct nvkm_vma *vma, u64 delta, struct nvkm_mem *node)
 {
+	const struct nvkm_vmm_page *page = vma->vm->func->page;
 	struct nvkm_vm *vm = vma->vm;
 	struct nvkm_mmu *mmu = vm->mmu;
 	struct nvkm_mm_node *r = node->mem;
@@ -262,6 +293,14 @@ nvkm_vm_map_at(struct nvkm_vma *vma, u64 delta, struct nvkm_mem *node)
 	u32 max  = 1 << (mmu->func->pgt_bits - bits);
 	u32 end, len;

+	if (page->desc->func->unmap) {
+		struct nvkm_vmm_map map = { .mem = node->mem };
+		while (page->shift != vma->node->type)
+			page++;
+		nvkm_vm_map_(page, vma, delta, node, page->desc->func->mem, &map);
+		return;
+	}
+
 	delta = 0;
 	while (r) {
 		u64 phys = (u64)r->offset << 12;
@@ -297,6 +336,7 @@ static void
 nvkm_vm_map_sg_table(struct nvkm_vma *vma, u64 delta, u64 length,
 		     struct nvkm_mem *mem)
 {
+	const struct nvkm_vmm_page *page = vma->vm->func->page;
 	struct nvkm_vm *vm = vma->vm;
 	struct nvkm_mmu *mmu = vm->mmu;
 	int big = vma->node->type != mmu->func->spg_shift;
@@ -311,6 +351,14 @@ nvkm_vm_map_sg_table(struct nvkm_vma *vma, u64 delta, u64 length,
 	int i;
 	struct scatterlist *sg;

+	if (page->desc->func->unmap) {
+		struct nvkm_vmm_map map = { .sgl = mem->sg->sgl };
+		while (page->shift != vma->node->type)
+			page++;
+		nvkm_vm_map_(page, vma, delta, mem, page->desc->func->sgl, &map);
+		return;
+	}
+
 	for_each_sg(mem->sg->sgl, sg, mem->sg->nents, i) {
 		struct nvkm_memory *pgt = vm->pgt[pde].mem[big];
 		sglen = sg_dma_len(sg) >> PAGE_SHIFT;
@@ -355,6 +403,7 @@ static void
 nvkm_vm_map_sg(struct nvkm_vma *vma, u64 delta, u64 length,
 	       struct nvkm_mem *mem)
 {
+	const struct nvkm_vmm_page *page = vma->vm->func->page;
 	struct nvkm_vm *vm = vma->vm;
 	struct nvkm_mmu *mmu = vm->mmu;
 	dma_addr_t *list = mem->pages;
@@ -367,6 +416,14 @@ nvkm_vm_map_sg(struct nvkm_vma *vma, u64 delta, u64 length,
 	u32 max  = 1 << (mmu->func->pgt_bits - bits);
 	u32 end, len;

+	if (page->desc->func->unmap) {
+		struct nvkm_vmm_map map = { .dma = mem->pages };
+		while (page->shift != vma->node->type)
+			page++;
+		nvkm_vm_map_(page, vma, delta, mem, page->desc->func->dma, &map);
+		return;
+	}
+
 	while (num) {
 		struct nvkm_memory *pgt = vm->pgt[pde].mem[big];

@@ -415,6 +472,17 @@ nvkm_vm_unmap_at(struct nvkm_vma *vma, u64 delta, u64 length)
 	u32 max  = 1 << (mmu->func->pgt_bits - bits);
 	u32 end, len;

+	if (vm->func->page->desc->func->unmap) {
+		const struct nvkm_vmm_page *page = vm->func->page;
+		while (page->shift != vma->node->type)
+			page++;
+		mutex_lock(&vm->mutex);
+		nvkm_vmm_ptes_unmap(vm, page, (vma->node->offset << 12) + delta,
+					       vma->node->length << 12, false);
+		mutex_unlock(&vm->mutex);
+		return;
+	}
+
 	while (num) {
 		struct nvkm_memory *pgt = vm->pgt[pde].mem[big];

@@ -440,6 +508,9 @@ void
 nvkm_vm_unmap(struct nvkm_vma *vma)
 {
 	nvkm_vm_unmap_at(vma, 0, (u64)vma->node->length << 12);
+
+	nvkm_memory_tags_put(vma->memory, vma->vm->mmu->subdev.device, &vma->tags);
+	nvkm_memory_unref(&vma->memory);
 }

 static void
@@ -509,6 +580,22 @@ nvkm_vm_get(struct nvkm_vm *vm, u64 size, u32 page_shift, u32 access,
 		return ret;
 	}

+	if (vm->func->page->desc->func->unmap) {
+		const struct nvkm_vmm_page *page = vm->func->page;
+		while (page->shift != page_shift)
+			page++;
+
+		ret = nvkm_vmm_ptes_get(vm, page, vma->node->offset << 12,
+						  vma->node->length << 12);
+		if (ret) {
+			nvkm_mm_free(&vm->mm, &vma->node);
+			mutex_unlock(&vm->mutex);
+			return ret;
+		}
+
+		goto done;
+	}
+
 	fpde = (vma->node->offset >> mmu->func->pgt_bits);
 	lpde = (vma->node->offset + vma->node->length - 1) >> mmu->func->pgt_bits;

@@ -530,8 +617,11 @@ nvkm_vm_get(struct nvkm_vm *vm, u64 size, u32 page_shift, u32 access,
 			return ret;
 		}
 	}
+done:
 	mutex_unlock(&vm->mutex);

+	vma->memory = NULL;
+	vma->tags = NULL;
 	vma->vm = NULL;
 	nvkm_vm_ref(vm, &vma->vm, NULL);
 	vma->offset = (u64)vma->node->offset << 12;
@@ -551,11 +641,25 @@ nvkm_vm_put(struct nvkm_vma *vma)
 	vm = vma->vm;
 	mmu = vm->mmu;

+	nvkm_memory_tags_put(vma->memory, mmu->subdev.device, &vma->tags);
+	nvkm_memory_unref(&vma->memory);
+
 	fpde = (vma->node->offset >> mmu->func->pgt_bits);
 	lpde = (vma->node->offset + vma->node->length - 1) >> mmu->func->pgt_bits;

 	mutex_lock(&vm->mutex);
+	if (vm->func->page->desc->func->unmap) {
+		const struct nvkm_vmm_page *page = vm->func->page;
+		while (page->shift != vma->node->type)
+			page++;
+
+		nvkm_vmm_ptes_put(vm, page, vma->node->offset << 12,
+					    vma->node->length << 12);
+		goto done;
+	}
+
 	nvkm_vm_unmap_pgt(vm, vma->node->type != mmu->func->spg_shift, fpde, lpde);
+done:
 	nvkm_mm_free(&vm->mm, &vma->node);
 	mutex_unlock(&vm->mutex);

@@ -569,6 +673,9 @@ nvkm_vm_boot(struct nvkm_vm *vm, u64 size)
 	struct nvkm_memory *pgt;
 	int ret;

+	if (vm->func->page->desc->func->unmap)
+		return nvkm_vmm_boot(vm);
+
 	ret = nvkm_memory_new(mmu->subdev.device, NVKM_MEM_TARGET_INST,
 			      (size >> mmu->func->spg_shift) * 8, 0x1000, true, &pgt);
 	if (ret == 0) {
@@ -660,7 +767,7 @@ nvkm_vm_ref(struct nvkm_vm *ref, struct nvkm_vm **ptr, struct nvkm_memory *inst)
 			if (ret)
 				return ret;

-			if (ref->mmu->func->map_pgt) {
+			if (!ref->func->page->desc->func->unmap && ref->mmu->func->map_pgt) {
 				for (i = ref->fpde; i <= ref->lpde; i++)
 					ref->mmu->func->map_pgt(ref, i, ref->pgt[i - ref->fpde].mem);
 			}
@@ -672,8 +779,12 @@ nvkm_vm_ref(struct nvkm_vm *ref, struct nvkm_vm **ptr, struct nvkm_memory *inst)
 	if (*ptr) {
 		if ((*ptr)->func->part && inst)
 			(*ptr)->func->part(*ptr, inst);
-		if ((*ptr)->bootstrapped && inst)
+		if ((*ptr)->bootstrapped && inst) {
+			if (!(*ptr)->func->page->desc->func->unmap) {
 				nvkm_memory_unref(&(*ptr)->pgt[0].mem[0]);
+				(*ptr)->bootstrapped = false;
+			}
+		}
 		kref_put(&(*ptr)->refcount, nvkm_vm_del);
 	}


--- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c
@@ -67,9 +67,559 @@ nvkm_vmm_pt_new(const struct nvkm_vmm_desc *desc, bool sparse,
 	return pgt;
 }

+struct nvkm_vmm_iter {
+	const struct nvkm_vmm_page *page;
+	const struct nvkm_vmm_desc *desc;
+	struct nvkm_vmm *vmm;
+	u64 cnt;
+	u16 max, lvl;
+	u32 pte[NVKM_VMM_LEVELS_MAX];
+	struct nvkm_vmm_pt *pt[NVKM_VMM_LEVELS_MAX];
+	int flush;
+};
+
+#ifdef CONFIG_NOUVEAU_DEBUG_MMU
+static const char *
+nvkm_vmm_desc_type(const struct nvkm_vmm_desc *desc)
+{
+	switch (desc->type) {
+	case PGD: return "PGD";
+	case PGT: return "PGT";
+	case SPT: return "SPT";
+	case LPT: return "LPT";
+	default:
+		return "UNKNOWN";
+	}
+}
+
+static void
+nvkm_vmm_trace(struct nvkm_vmm_iter *it, char *buf)
+{
+	int lvl;
+	for (lvl = it->max; lvl >= 0; lvl--) {
+		if (lvl >= it->lvl)
+			buf += sprintf(buf,  "%05x:", it->pte[lvl]);
+		else
+			buf += sprintf(buf, "xxxxx:");
+	}
+}
+
+#define TRA(i,f,a...) do {                                                     \
+	char _buf[NVKM_VMM_LEVELS_MAX * 7];                                    \
+	struct nvkm_vmm_iter *_it = (i);                                       \
+	nvkm_vmm_trace(_it, _buf);                                             \
+	VMM_TRACE(_it->vmm, "%s "f, _buf, ##a);                                \
+} while(0)
+#else
+#define TRA(i,f,a...)
+#endif
+
+static inline void
+nvkm_vmm_flush_mark(struct nvkm_vmm_iter *it)
+{
+	it->flush = min(it->flush, it->max - it->lvl);
+}
+
+static inline void
+nvkm_vmm_flush(struct nvkm_vmm_iter *it)
+{
+	if (it->flush != NVKM_VMM_LEVELS_MAX) {
+		if (it->vmm->func->flush) {
+			TRA(it, "flush: %d", it->flush);
+			it->vmm->func->flush(it->vmm, it->flush);
+		}
+		it->flush = NVKM_VMM_LEVELS_MAX;
+	}
+}
+
+static void
+nvkm_vmm_unref_pdes(struct nvkm_vmm_iter *it)
+{
+	const struct nvkm_vmm_desc *desc = it->desc;
+	const int type = desc[it->lvl].type == SPT;
+	struct nvkm_vmm_pt *pgd = it->pt[it->lvl + 1];
+	struct nvkm_vmm_pt *pgt = it->pt[it->lvl];
+	struct nvkm_mmu_pt *pt = pgt->pt[type];
+	struct nvkm_vmm *vmm = it->vmm;
+	u32 pdei = it->pte[it->lvl + 1];
+
+	/* Recurse up the tree, unreferencing/destroying unneeded PDs. */
+	it->lvl++;
+	if (--pgd->refs[0]) {
+		const struct nvkm_vmm_desc_func *func = desc[it->lvl].func;
+		/* PD has other valid PDEs, so we need a proper update. */
+		TRA(it, "PDE unmap %s", nvkm_vmm_desc_type(&desc[it->lvl - 1]));
+		pgt->pt[type] = NULL;
+		if (!pgt->refs[!type]) {
+			/* PDE no longer required. */
+			if (pgd->pt[0]) {
+				if (pgt->sparse) {
+					func->sparse(vmm, pgd->pt[0], pdei, 1);
+					pgd->pde[pdei] = NVKM_VMM_PDE_SPARSE;
+				} else {
+					func->unmap(vmm, pgd->pt[0], pdei, 1);
+					pgd->pde[pdei] = NULL;
+				}
+			} else {
+				/* Special handling for Tesla-class GPUs,
+				 * where there's no central PD, but each
+				 * instance has its own embedded PD.
+				 */
+				func->pde(vmm, pgd, pdei);
+				pgd->pde[pdei] = NULL;
+			}
+		} else {
+			/* PDE was pointing at dual-PTs and we're removing
+			 * one of them, leaving the other in place.
+			 */
+			func->pde(vmm, pgd, pdei);
+		}
+
+		/* GPU may have cached the PTs, flush before freeing. */
+		nvkm_vmm_flush_mark(it);
+		nvkm_vmm_flush(it);
+	} else {
+		/* PD has no valid PDEs left, so we can just destroy it. */
+		nvkm_vmm_unref_pdes(it);
+	}
+
+	/* Destroy PD/PT. */
+	TRA(it, "PDE free %s", nvkm_vmm_desc_type(&desc[it->lvl - 1]));
+	nvkm_mmu_ptc_put(vmm->mmu, vmm->bootstrapped, &pt);
+	if (!pgt->refs[!type])
+		nvkm_vmm_pt_del(&pgt);
+	it->lvl--;
+}
+
+static void
+nvkm_vmm_unref_sptes(struct nvkm_vmm_iter *it, struct nvkm_vmm_pt *pgt,
+		     const struct nvkm_vmm_desc *desc, u32 ptei, u32 ptes)
+{
+	const struct nvkm_vmm_desc *pair = it->page[-1].desc;
+	const u32 sptb = desc->bits - pair->bits;
+	const u32 sptn = 1 << sptb;
+	struct nvkm_vmm *vmm = it->vmm;
+	u32 spti = ptei & (sptn - 1), lpti, pteb;
+
+	/* Determine how many SPTEs are being touched under each LPTE,
+	 * and drop reference counts.
+	 */
+	for (lpti = ptei >> sptb; ptes; spti = 0, lpti++) {
+		const u32 pten = min(sptn - spti, ptes);
+		pgt->pte[lpti] -= pten;
+		ptes -= pten;
+	}
+
+	/* We're done here if there's no corresponding LPT. */
+	if (!pgt->refs[0])
+		return;
+
+	for (ptei = pteb = ptei >> sptb; ptei < lpti; pteb = ptei) {
+		/* Skip over any LPTEs that still have valid SPTEs. */
+		if (pgt->pte[pteb] & NVKM_VMM_PTE_SPTES) {
+			for (ptes = 1, ptei++; ptei < lpti; ptes++, ptei++) {
+				if (!(pgt->pte[ptei] & NVKM_VMM_PTE_SPTES))
+					break;
+			}
+			continue;
+		}
+
+		/* As there's no more non-UNMAPPED SPTEs left in the range
+		 * covered by a number of LPTEs, the LPTEs once again take
+		 * control over their address range.
+		 *
+		 * Determine how many LPTEs need to transition state.
+		 */
+		pgt->pte[ptei] &= ~NVKM_VMM_PTE_VALID;
+		for (ptes = 1, ptei++; ptei < lpti; ptes++, ptei++) {
+			if (pgt->pte[ptei] & NVKM_VMM_PTE_SPTES)
+				break;
+			pgt->pte[ptei] &= ~NVKM_VMM_PTE_VALID;
+		}
+
+		if (pgt->pte[pteb] & NVKM_VMM_PTE_SPARSE) {
+			TRA(it, "LPTE %05x: U -> S %d PTEs", pteb, ptes);
+			pair->func->sparse(vmm, pgt->pt[0], pteb, ptes);
+		} else
+		if (pair->func->invalid) {
+			/* If the MMU supports it, restore the LPTE to the
+			 * INVALID state to tell the MMU there is no point
+			 * trying to fetch the corresponding SPTEs.
+			 */
+			TRA(it, "LPTE %05x: U -> I %d PTEs", pteb, ptes);
+			pair->func->invalid(vmm, pgt->pt[0], pteb, ptes);
+		}
+	}
+}
+
+static bool
+nvkm_vmm_unref_ptes(struct nvkm_vmm_iter *it, u32 ptei, u32 ptes)
+{
+	const struct nvkm_vmm_desc *desc = it->desc;
+	const int type = desc->type == SPT;
+	struct nvkm_vmm_pt *pgt = it->pt[0];
+
+	/* Drop PTE references. */
+	pgt->refs[type] -= ptes;
+
+	/* Dual-PTs need special handling, unless PDE becoming invalid. */
+	if (desc->type == SPT && (pgt->refs[0] || pgt->refs[1]))
+		nvkm_vmm_unref_sptes(it, pgt, desc, ptei, ptes);
+
+	/* PT no longer neeed?  Destroy it. */
+	if (!pgt->refs[type]) {
+		it->lvl++;
+		TRA(it, "%s empty", nvkm_vmm_desc_type(desc));
+		it->lvl--;
+		nvkm_vmm_unref_pdes(it);
+		return false; /* PTE writes for unmap() not necessary. */
+	}
+
+	return true;
+}
+
+static void
+nvkm_vmm_ref_sptes(struct nvkm_vmm_iter *it, struct nvkm_vmm_pt *pgt,
+		   const struct nvkm_vmm_desc *desc, u32 ptei, u32 ptes)
+{
+	const struct nvkm_vmm_desc *pair = it->page[-1].desc;
+	const u32 sptb = desc->bits - pair->bits;
+	const u32 sptn = 1 << sptb;
+	struct nvkm_vmm *vmm = it->vmm;
+	u32 spti = ptei & (sptn - 1), lpti, pteb;
+
+	/* Determine how many SPTEs are being touched under each LPTE,
+	 * and increase reference counts.
+	 */
+	for (lpti = ptei >> sptb; ptes; spti = 0, lpti++) {
+		const u32 pten = min(sptn - spti, ptes);
+		pgt->pte[lpti] += pten;
+		ptes -= pten;
+	}
+
+	/* We're done here if there's no corresponding LPT. */
+	if (!pgt->refs[0])
+		return;
+
+	for (ptei = pteb = ptei >> sptb; ptei < lpti; pteb = ptei) {
+		/* Skip over any LPTEs that already have valid SPTEs. */
+		if (pgt->pte[pteb] & NVKM_VMM_PTE_VALID) {
+			for (ptes = 1, ptei++; ptei < lpti; ptes++, ptei++) {
+				if (!(pgt->pte[ptei] & NVKM_VMM_PTE_VALID))
+					break;
+			}
+			continue;
+		}
+
+		/* As there are now non-UNMAPPED SPTEs in the range covered
+		 * by a number of LPTEs, we need to transfer control of the
+		 * address range to the SPTEs.
+		 *
+		 * Determine how many LPTEs need to transition state.
+		 */
+		pgt->pte[ptei] |= NVKM_VMM_PTE_VALID;
+		for (ptes = 1, ptei++; ptei < lpti; ptes++, ptei++) {
+			if (pgt->pte[ptei] & NVKM_VMM_PTE_VALID)
+				break;
+			pgt->pte[ptei] |= NVKM_VMM_PTE_VALID;
+		}
+
+		if (pgt->pte[pteb] & NVKM_VMM_PTE_SPARSE) {
+			const u32 spti = pteb * sptn;
+			const u32 sptc = ptes * sptn;
+			/* The entire LPTE is marked as sparse, we need
+			 * to make sure that the SPTEs are too.
+			 */
+			TRA(it, "SPTE %05x: U -> S %d PTEs", spti, sptc);
+			desc->func->sparse(vmm, pgt->pt[1], spti, sptc);
+			/* Sparse LPTEs prevent SPTEs from being accessed. */
+			TRA(it, "LPTE %05x: S -> U %d PTEs", pteb, ptes);
+			pair->func->unmap(vmm, pgt->pt[0], pteb, ptes);
+		} else
+		if (pair->func->invalid) {
+			/* MMU supports blocking SPTEs by marking an LPTE
+			 * as INVALID.  We need to reverse that here.
+			 */
+			TRA(it, "LPTE %05x: I -> U %d PTEs", pteb, ptes);
+			pair->func->unmap(vmm, pgt->pt[0], pteb, ptes);
+		}
+	}
+}
+
+static bool
+nvkm_vmm_ref_ptes(struct nvkm_vmm_iter *it, u32 ptei, u32 ptes)
+{
+	const struct nvkm_vmm_desc *desc = it->desc;
+	const int type = desc->type == SPT;
+	struct nvkm_vmm_pt *pgt = it->pt[0];
+
+	/* Take PTE references. */
+	pgt->refs[type] += ptes;
+
+	/* Dual-PTs need special handling. */
+	if (desc->type == SPT)
+		nvkm_vmm_ref_sptes(it, pgt, desc, ptei, ptes);
+
+	return true;
+}
+
+static void
+nvkm_vmm_sparse_ptes(const struct nvkm_vmm_desc *desc,
+		     struct nvkm_vmm_pt *pgt, u32 ptei, u32 ptes)
+{
+	if (desc->type == PGD) {
+		while (ptes--)
+			pgt->pde[ptei++] = NVKM_VMM_PDE_SPARSE;
+	} else
+	if (desc->type == LPT) {
+		memset(&pgt->pte[ptei], NVKM_VMM_PTE_SPARSE, ptes);
+	}
+}
+
+static bool
+nvkm_vmm_ref_hwpt(struct nvkm_vmm_iter *it, struct nvkm_vmm_pt *pgd, u32 pdei)
+{
+	const struct nvkm_vmm_desc *desc = &it->desc[it->lvl - 1];
+	const int type = desc->type == SPT;
+	struct nvkm_vmm_pt *pgt = pgd->pde[pdei];
+	const bool zero = !pgt->sparse && !desc->func->invalid;
+	struct nvkm_vmm *vmm = it->vmm;
+	struct nvkm_mmu *mmu = vmm->mmu;
+	struct nvkm_mmu_pt *pt;
+	u32 pten = 1 << desc->bits;
+	u32 pteb, ptei, ptes;
+	u32 size = desc->size * pten;
+
+	pgd->refs[0]++;
+
+	pgt->pt[type] = nvkm_mmu_ptc_get(mmu, size, desc->align, zero);
+	if (!pgt->pt[type]) {
+		it->lvl--;
+		nvkm_vmm_unref_pdes(it);
+		return false;
+	}
+
+	if (zero)
+		goto done;
+
+	pt = pgt->pt[type];
+
+	if (desc->type == LPT && pgt->refs[1]) {
+		/* SPT already exists covering the same range as this LPT,
+		 * which means we need to be careful that any LPTEs which
+		 * overlap valid SPTEs are unmapped as opposed to invalid
+		 * or sparse, which would prevent the MMU from looking at
+		 * the SPTEs on some GPUs.
+		 */
+		for (ptei = pteb = 0; ptei < pten; pteb = ptei) {
+			bool spte = pgt->pte[ptei] & NVKM_VMM_PTE_SPTES;
+			for (ptes = 1, ptei++; ptei < pten; ptes++, ptei++) {
+				bool next = pgt->pte[ptei] & NVKM_VMM_PTE_SPTES;
+				if (spte != next)
+					break;
+			}
+
+			if (!spte) {
+				if (pgt->sparse)
+					desc->func->sparse(vmm, pt, pteb, ptes);
+				else
+					desc->func->invalid(vmm, pt, pteb, ptes);
+				memset(&pgt->pte[pteb], 0x00, ptes);
+			} else {
+				desc->func->unmap(vmm, pt, pteb, ptes);
+				while (ptes--)
+					pgt->pte[pteb++] |= NVKM_VMM_PTE_VALID;
+			}
+		}
+	} else {
+		if (pgt->sparse) {
+			nvkm_vmm_sparse_ptes(desc, pgt, 0, pten);
+			desc->func->sparse(vmm, pt, 0, pten);
+		} else {
+			desc->func->invalid(vmm, pt, 0, pten);
+		}
+	}
+
+done:
+	TRA(it, "PDE write %s", nvkm_vmm_desc_type(desc));
+	it->desc[it->lvl].func->pde(it->vmm, pgd, pdei);
+	nvkm_vmm_flush_mark(it);
+	return true;
+}
+
+static bool
+nvkm_vmm_ref_swpt(struct nvkm_vmm_iter *it, struct nvkm_vmm_pt *pgd, u32 pdei)
+{
+	const struct nvkm_vmm_desc *desc = &it->desc[it->lvl - 1];
+	struct nvkm_vmm_pt *pgt = pgd->pde[pdei];
+
+	pgt = nvkm_vmm_pt_new(desc, NVKM_VMM_PDE_SPARSED(pgt), it->page);
+	if (!pgt) {
+		if (!pgd->refs[0])
+			nvkm_vmm_unref_pdes(it);
+		return false;
+	}
+
+	pgd->pde[pdei] = pgt;
+	return true;
+}
+
+static inline u64
+nvkm_vmm_iter(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page,
+	      u64 addr, u64 size, const char *name, bool ref,
+	      bool (*REF_PTES)(struct nvkm_vmm_iter *, u32, u32),
+	      nvkm_vmm_pte_func MAP_PTES, struct nvkm_vmm_map *map,
+	      nvkm_vmm_pxe_func CLR_PTES)
+{
+	const struct nvkm_vmm_desc *desc = page->desc;
+	struct nvkm_vmm_iter it;
+	u64 bits = addr >> page->shift;
+
+	it.page = page;
+	it.desc = desc;
+	it.vmm = vmm;
+	it.cnt = size >> page->shift;
+	it.flush = NVKM_VMM_LEVELS_MAX;
+
+	/* Deconstruct address into PTE indices for each mapping level. */
+	for (it.lvl = 0; desc[it.lvl].bits; it.lvl++) {
+		it.pte[it.lvl] = bits & ((1 << desc[it.lvl].bits) - 1);
+		bits >>= desc[it.lvl].bits;
+	}
+	it.max = --it.lvl;
+	it.pt[it.max] = vmm->pd;
+
+	it.lvl = 0;
+	TRA(&it, "%s: %016llx %016llx %d %lld PTEs", name,
+	         addr, size, page->shift, it.cnt);
+	it.lvl = it.max;
+
+	/* Depth-first traversal of page tables. */
+	while (it.cnt) {
+		struct nvkm_vmm_pt *pgt = it.pt[it.lvl];
+		const int type = desc->type == SPT;
+		const u32 pten = 1 << desc->bits;
+		const u32 ptei = it.pte[0];
+		const u32 ptes = min_t(u64, it.cnt, pten - ptei);
+
+		/* Walk down the tree, finding page tables for each level. */
+		for (; it.lvl; it.lvl--) {
+			const u32 pdei = it.pte[it.lvl];
+			struct nvkm_vmm_pt *pgd = pgt;
+
+			/* Software PT. */
+			if (ref && NVKM_VMM_PDE_INVALID(pgd->pde[pdei])) {
+				if (!nvkm_vmm_ref_swpt(&it, pgd, pdei))
+					goto fail;
+			}
+			it.pt[it.lvl - 1] = pgt = pgd->pde[pdei];
+
+			/* Hardware PT.
+			 *
+			 * This is a separate step from above due to GF100 and
+			 * newer having dual page tables at some levels, which
+			 * are refcounted independently.
+			 */
+			if (ref && !pgt->refs[desc[it.lvl - 1].type == SPT]) {
+				if (!nvkm_vmm_ref_hwpt(&it, pgd, pdei))
+					goto fail;
+			}
+		}
+
+		/* Handle PTE updates. */
+		if (!REF_PTES || REF_PTES(&it, ptei, ptes)) {
+			struct nvkm_mmu_pt *pt = pgt->pt[type];
+			if (MAP_PTES || CLR_PTES) {
+				if (MAP_PTES)
+					MAP_PTES(vmm, pt, ptei, ptes, map);
+				else
+					CLR_PTES(vmm, pt, ptei, ptes);
+				nvkm_vmm_flush_mark(&it);
+			}
+		}
+
+		/* Walk back up the tree to the next position. */
+		it.pte[it.lvl] += ptes;
+		it.cnt -= ptes;
+		if (it.cnt) {
+			while (it.pte[it.lvl] == (1 << desc[it.lvl].bits)) {
+				it.pte[it.lvl++] = 0;
+				it.pte[it.lvl]++;
+			}
+		}
+	};
+
+	nvkm_vmm_flush(&it);
+	return ~0ULL;
+
+fail:
+	/* Reconstruct the failure address so the caller is able to
+	 * reverse any partially completed operations.
+	 */
+	addr = it.pte[it.max--];
+	do {
+		addr  = addr << desc[it.max].bits;
+		addr |= it.pte[it.max];
+	} while (it.max--);
+
+	return addr << page->shift;
+}
+
+void
+nvkm_vmm_ptes_unmap(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page,
+		    u64 addr, u64 size, bool sparse)
+{
+	const struct nvkm_vmm_desc_func *func = page->desc->func;
+	nvkm_vmm_iter(vmm, page, addr, size, "unmap", false, NULL, NULL, NULL,
+		      sparse ? func->sparse : func->invalid ? func->invalid :
+							      func->unmap);
+}
+
+void
+nvkm_vmm_ptes_map(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page,
+		  u64 addr, u64 size, struct nvkm_vmm_map *map,
+		  nvkm_vmm_pte_func func)
+{
+	nvkm_vmm_iter(vmm, page, addr, size, "map", false,
+		      NULL, func, map, NULL);
+}
+
+void
+nvkm_vmm_ptes_put(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page,
+		  u64 addr, u64 size)
+{
+	nvkm_vmm_iter(vmm, page, addr, size, "unref", false,
+		      nvkm_vmm_unref_ptes, NULL, NULL, NULL);
+}
+
+int
+nvkm_vmm_ptes_get(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page,
+		  u64 addr, u64 size)
+{
+	u64 fail = nvkm_vmm_iter(vmm, page, addr, size, "ref", true,
+				 nvkm_vmm_ref_ptes, NULL, NULL, NULL);
+	if (fail != ~0ULL) {
+		if (fail != addr)
+			nvkm_vmm_ptes_put(vmm, page, addr, fail - addr);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
 void
 nvkm_vmm_dtor(struct nvkm_vmm *vmm)
 {
+	if (vmm->bootstrapped) {
+		const struct nvkm_vmm_page *page = vmm->func->page;
+		const u64 limit = vmm->limit - vmm->start;
+
+		while (page[1].shift)
+			page++;
+
+		nvkm_mmu_ptc_dump(vmm->mmu);
+		nvkm_vmm_ptes_put(vmm, page, vmm->start, limit);
+	}
+
 	if (vmm->nullp) {
 		dma_free_coherent(vmm->mmu->subdev.device->dev, 16 * 1024,
 				  vmm->nullp, vmm->null);
@@ -94,6 +644,7 @@ nvkm_vmm_ctor(const struct nvkm_vmm_func *func, struct nvkm_mmu *mmu,
 	vmm->func = func;
 	vmm->mmu = mmu;
 	vmm->name = name;
+	vmm->debug = mmu->subdev.debug;
 	kref_init(&vmm->kref);

 	__mutex_init(&vmm->mutex, "&vmm->mutex", key ? key : &_key);
@@ -150,3 +701,32 @@ nvkm_vmm_new_(const struct nvkm_vmm_func *func, struct nvkm_mmu *mmu,
 		return -ENOMEM;
 	return nvkm_vmm_ctor(func, mmu, hdr, addr, size, key, name, *pvmm);
 }
+
+static bool
+nvkm_vmm_boot_ptes(struct nvkm_vmm_iter *it, u32 ptei, u32 ptes)
+{
+	const struct nvkm_vmm_desc *desc = it->desc;
+	const int type = desc->type == SPT;
+	nvkm_memory_boot(it->pt[0]->pt[type]->memory, it->vmm);
+	return false;
+}
+
+int
+nvkm_vmm_boot(struct nvkm_vmm *vmm)
+{
+	const struct nvkm_vmm_page *page = vmm->func->page;
+	const u64 limit = vmm->limit - vmm->start;
+	int ret;
+
+	while (page[1].shift)
+		page++;
+
+	ret = nvkm_vmm_ptes_get(vmm, page, vmm->start, limit);
+	if (ret)
+		return ret;
+
+	nvkm_vmm_iter(vmm, page, vmm->start, limit, "bootstrap", false,
+		      nvkm_vmm_boot_ptes, NULL, NULL, NULL);
+	vmm->bootstrapped = true;
+	return 0;
+}
--- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h
+++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h
@@ -2,6 +2,7 @@
 #define __NVKM_VMM_H__
 #include "priv.h"
 #include <core/memory.h>
+enum nvkm_memory_target;

 struct nvkm_vmm_pt {
 	/* Some GPUs have a mapping level with a dual page tables to
@@ -49,7 +50,23 @@ struct nvkm_vmm_pt {
 	u8 pte[];
 };

+typedef void (*nvkm_vmm_pxe_func)(struct nvkm_vmm *,
+				  struct nvkm_mmu_pt *, u32 ptei, u32 ptes);
+typedef void (*nvkm_vmm_pde_func)(struct nvkm_vmm *,
+				  struct nvkm_vmm_pt *, u32 pdei);
+typedef void (*nvkm_vmm_pte_func)(struct nvkm_vmm *, struct nvkm_mmu_pt *,
+				  u32 ptei, u32 ptes, struct nvkm_vmm_map *);
+
 struct nvkm_vmm_desc_func {
+	nvkm_vmm_pxe_func invalid;
+	nvkm_vmm_pxe_func unmap;
+	nvkm_vmm_pxe_func sparse;
+
+	nvkm_vmm_pde_func pde;
+
+	nvkm_vmm_pte_func mem;
+	nvkm_vmm_pte_func dma;
+	nvkm_vmm_pte_func sgl;
 };

 extern const struct nvkm_vmm_desc_func gf100_vmm_pgd;
@@ -106,6 +123,11 @@ struct nvkm_vmm_func {
 	int (*join)(struct nvkm_vmm *, struct nvkm_memory *inst);
 	void (*part)(struct nvkm_vmm *, struct nvkm_memory *inst);

+	int (*aper)(enum nvkm_memory_target);
+	int (*valid)(struct nvkm_vmm *, void *argv, u32 argc,
+		     struct nvkm_vmm_map *);
+	void (*flush)(struct nvkm_vmm *, int depth);
+
 	u64 page_block;
 	const struct nvkm_vmm_page page[];
 };
@@ -122,6 +144,15 @@ int nvkm_vmm_ctor(const struct nvkm_vmm_func *, struct nvkm_mmu *,
 		  u32 pd_header, u64 addr, u64 size, struct lock_class_key *,
 		  const char *name, struct nvkm_vmm *);
 void nvkm_vmm_dtor(struct nvkm_vmm *);
+void nvkm_vmm_ptes_put(struct nvkm_vmm *, const struct nvkm_vmm_page *,
+		       u64 addr, u64 size);
+int nvkm_vmm_ptes_get(struct nvkm_vmm *, const struct nvkm_vmm_page *,
+		      u64 addr, u64 size);
+void nvkm_vmm_ptes_map(struct nvkm_vmm *, const struct nvkm_vmm_page *,
+		       u64 addr, u64 size, struct nvkm_vmm_map *,
+		       nvkm_vmm_pte_func);
+void nvkm_vmm_ptes_unmap(struct nvkm_vmm *, const struct nvkm_vmm_page *,
+			 u64 addr, u64 size, bool sparse);

 int nv04_vmm_new_(const struct nvkm_vmm_func *, struct nvkm_mmu *, u32,
 		  u64, u64, void *, u32, struct lock_class_key *,
@@ -176,4 +207,85 @@ int gp100_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32,
 int gp10b_vmm_new(struct nvkm_mmu *, u64, u64, void *, u32,
 		  struct lock_class_key *, const char *,
 		  struct nvkm_vmm **);
+
+#define VMM_PRINT(l,v,p,f,a...) do {                                           \
+	struct nvkm_vmm *_vmm = (v);                                           \
+	if (CONFIG_NOUVEAU_DEBUG >= (l) && _vmm->debug >= (l)) {               \
+		nvkm_printk_(&_vmm->mmu->subdev, 0, p, "%s: "f"\n",            \
+			     _vmm->name, ##a);                                 \
+	}                                                                      \
+} while(0)
+#define VMM_DEBUG(v,f,a...) VMM_PRINT(NV_DBG_DEBUG, (v), info, f, ##a)
+#define VMM_TRACE(v,f,a...) VMM_PRINT(NV_DBG_TRACE, (v), info, f, ##a)
+#define VMM_SPAM(v,f,a...)  VMM_PRINT(NV_DBG_SPAM , (v),  dbg, f, ##a)
+
+#define VMM_MAP_ITER(VMM,PT,PTEI,PTEN,MAP,FILL,BASE,SIZE,NEXT) do {            \
+	nvkm_kmap((PT)->memory);                                               \
+	while (PTEN) {                                                         \
+		u64 _ptes = ((SIZE) - MAP->off) >> MAP->page->shift;           \
+		u64 _addr = ((BASE) + MAP->off);                               \
+                                                                               \
+		if (_ptes > PTEN) {                                            \
+			MAP->off += PTEN << MAP->page->shift;                  \
+			_ptes = PTEN;                                          \
+		} else {                                                       \
+			MAP->off = 0;                                          \
+			NEXT;                                                  \
+		}                                                              \
+                                                                               \
+		VMM_SPAM(VMM, "ITER %08x %08x PTE(s)", PTEI, (u32)_ptes);      \
+                                                                               \
+		FILL(VMM, PT, PTEI, _ptes, MAP, _addr);                        \
+		PTEI += _ptes;                                                 \
+		PTEN -= _ptes;                                                 \
+	};                                                                     \
+	nvkm_done((PT)->memory);                                               \
+} while(0)
+
+#define VMM_MAP_ITER_MEM(VMM,PT,PTEI,PTEN,MAP,FILL)                            \
+	VMM_MAP_ITER(VMM,PT,PTEI,PTEN,MAP,FILL,                                \
+		     ((u64)MAP->mem->offset << NVKM_RAM_MM_SHIFT),             \
+		     ((u64)MAP->mem->length << NVKM_RAM_MM_SHIFT),             \
+		     (MAP->mem = MAP->mem->next))
+#define VMM_MAP_ITER_DMA(VMM,PT,PTEI,PTEN,MAP,FILL)                            \
+	VMM_MAP_ITER(VMM,PT,PTEI,PTEN,MAP,FILL,                                \
+		     *MAP->dma, PAGE_SIZE, MAP->dma++)
+#define VMM_MAP_ITER_SGL(VMM,PT,PTEI,PTEN,MAP,FILL)                            \
+	VMM_MAP_ITER(VMM,PT,PTEI,PTEN,MAP,FILL,                                \
+		     sg_dma_address(MAP->sgl), sg_dma_len(MAP->sgl),           \
+		     (MAP->sgl = sg_next(MAP->sgl)))
+
+#define VMM_FO(m,o,d,c,b) nvkm_fo##b((m)->memory, (o), (d), (c))
+#define VMM_WO(m,o,d,c,b) nvkm_wo##b((m)->memory, (o), (d))
+#define VMM_XO(m,v,o,d,c,b,fn,f,a...) do {                                     \
+	const u32 _pteo = (o); u##b _data = (d);                               \
+	VMM_SPAM((v), "   %010llx "f, (m)->addr + _pteo, _data, ##a);          \
+	VMM_##fn((m), (m)->base + _pteo, _data, (c), b);                       \
+} while(0)
+
+#define VMM_WO032(m,v,o,d) VMM_XO((m),(v),(o),(d),  1, 32, WO, "%08x")
+#define VMM_FO032(m,v,o,d,c)                                                   \
+	VMM_XO((m),(v),(o),(d),(c), 32, FO, "%08x %08x", (c))
+
+#define VMM_WO064(m,v,o,d) VMM_XO((m),(v),(o),(d),  1, 64, WO, "%016llx")
+#define VMM_FO064(m,v,o,d,c)                                                   \
+	VMM_XO((m),(v),(o),(d),(c), 64, FO, "%016llx %08x", (c))
+
+#define VMM_XO128(m,v,o,lo,hi,c,f,a...) do {                                   \
+	u32 _pteo = (o), _ptes = (c);                                          \
+	const u64 _addr = (m)->addr + _pteo;                                   \
+	VMM_SPAM((v), "   %010llx %016llx%016llx"f, _addr, (hi), (lo), ##a);   \
+	while (_ptes--) {                                                      \
+		nvkm_wo64((m)->memory, (m)->base + _pteo + 0, (lo));           \
+		nvkm_wo64((m)->memory, (m)->base + _pteo + 8, (hi));           \
+		_pteo += 0x10;                                                 \
+	}                                                                      \
+} while(0)
+
+#define VMM_WO128(m,v,o,lo,hi) VMM_XO128((m),(v),(o),(lo),(hi), 1, "")
+#define VMM_FO128(m,v,o,lo,hi,c) do {                                          \
+	nvkm_kmap((m)->memory);                                                \
+	VMM_XO128((m),(v),(o),(lo),(hi),(c), " %08x", (c));                    \
+	nvkm_done((m)->memory);                                                \
+} while(0)
 #endif