Commit 166fd7d9 authored by Alex Williamson

vfio: hugepage support for vfio_iommu_type1

We currently send all mappings to the iommu in PAGE_SIZE chunks,
which prevents the iommu from enabling support for larger page sizes.
We still need to pin pages, which means we step through them in
PAGE_SIZE chunks, but we can batch up contiguous physical memory
chunks to allow the iommu the opportunity to use larger pages.  The
approach here is a bit different from the one currently used for
legacy KVM device assignment.  Rather than looking at the vma page
size and using that as the maximum size to pass to the iommu, we
instead simply look at whether the next page is physically
contiguous.  This means we might ask the iommu to map a 4MB region,
while legacy KVM might limit itself to a maximum of 2MB.
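
The batching loop at the heart of this (vfio_pin_pages() in the diff
below) can be sketched as follows.  This is an illustrative sketch,
not the patch itself; pin_one_page() and unpin_one_page() are
hypothetical stand-ins for the vaddr_get_pfn()/put_pfn() helpers:

    /*
     * Pin the first page, then keep extending the run for as long as
     * the next pinned page is physically contiguous with the last.
     * Returns the length of the contiguous run, in pages.
     */
    static long pin_contiguous_run(unsigned long vaddr, long npage,
                                   unsigned long *pfn_base)
    {
            long i;

            if (pin_one_page(vaddr, pfn_base))
                    return 0;

            for (i = 1; i < npage; i++) {
                    unsigned long pfn;

                    if (pin_one_page(vaddr + i * PAGE_SIZE, &pfn))
                            break;
                    if (pfn != *pfn_base + i) {
                            /* not contiguous; starts a new run */
                            unpin_one_page(pfn);
                            break;
                    }
            }
            return i;
    }

The whole run can then be handed to iommu_map() in a single call,
leaving the iommu driver free to use the largest page size that fits.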

Splitting our mapping path also allows us to be smarter about locked
memory because we can more easily unwind if the user attempts to
exceed the limit.  Therefore, rather than assuming that a mapping
will result in locked memory, we test each page as it is pinned to
determine whether it locks RAM vs an mmap'd MMIO region.  This should
result in better locking granularity and fewer locked page fudge
factors in userspace.
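
In the actual patch this test is is_invalid_reserved_pfn(), applied as
each page is pinned (a reserved pfn also ends a contiguous run).  A
simplified sketch of the accounting idea:

    /*
     * Only pfns backed by real RAM count against RLIMIT_MEMLOCK;
     * reserved/invalid pfns (e.g. mmap'd MMIO BARs) are not accounted.
     */
    long i, locked = 0;

    for (i = 0; i < npage; i++)
            if (!is_invalid_reserved_pfn(pfn_base + i))
                    locked++;

    vfio_lock_acct(locked);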

The unmap path uses the same algorithm as legacy KVM.  We don't want
to track the pfn for each mapping ourselves, but we need the pfn in
order to unpin pages.  We therefore ask the iommu for the iova to
physical address translation, ask it to unpin a page, and see how many
pages were actually unpinned.  iommus supporting large pages will
often return something bigger than a page here, which we know to be
physically contiguous, so we can unpin a batch of pfns.  iommus not
supporting large mappings won't see an improvement in batching here as
they only unmap a page at a time.
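
The walk in vfio_unmap_unpin() below reduces to roughly this loop,
where domain and unpin_pages() stand in for iommu->domain and
vfio_unpin_pages():

    while (iova < end) {
            phys_addr_t phys = iommu_iova_to_phys(domain, iova);
            size_t unmapped;

            /* ask for one page; a large-page iommu may give back more */
            unmapped = iommu_unmap(domain, iova, PAGE_SIZE);
            if (!unmapped)
                    break;

            /* everything unmapped by one call is physically contiguous */
            unpin_pages(phys >> PAGE_SHIFT, unmapped >> PAGE_SHIFT);
            iova += unmapped;
    }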

With this change, we also clarify the API for mapping and
unmapping DMA.  We can only guarantee unmaps at the same
granularity as used for the original mapping.  In other words,
unmapping a subregion of a previous mapping is not guaranteed and may
result in a larger or smaller unmapping than requested.  The size
field in the unmapping structure is updated to reflect this.
Previously this was unmodified on unmap, always returning the
requested unmap size.  This is now updated to return the actual unmap
size on success, allowing userspace to appropriately track mappings.
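
For userspace this means re-reading the size field after the ioctl
returns.  An illustrative example, assuming an open VFIO container fd
with the type1 iommu enabled and <linux/vfio.h>, <sys/ioctl.h> and
<stdio.h> included (the iova/size values are made up):

    struct vfio_iommu_type1_dma_unmap unmap = {
            .argsz = sizeof(unmap),
            .iova  = 0x100000,
            .size  = 0x200000,
    };

    if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap))
            perror("VFIO_IOMMU_UNMAP_DMA");
    else if (unmap.size != 0x200000)
            /* the kernel reports the size actually unmapped */
            fprintf(stderr, "unmapped 0x%llx bytes\n",
                    (unsigned long long)unmap.size);
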
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
parent cd9b2268
@@ -60,7 +60,7 @@ struct vfio_dma {
struct rb_node node;
dma_addr_t iova; /* Device address */
unsigned long vaddr; /* Process virtual addr */
long npage; /* Number of pages */
size_t size; /* Map size (bytes) */
int prot; /* IOMMU_READ/WRITE */
};
@@ -74,8 +74,6 @@ struct vfio_group {
* into DMA'ble space using the IOMMU
*/
#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)
static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
dma_addr_t start, size_t size)
{
@@ -86,7 +84,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
if (start + size <= dma->iova)
node = node->rb_left;
else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage))
else if (start >= dma->iova + dma->size)
node = node->rb_right;
else
return dma;
@@ -104,7 +102,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
parent = *link;
dma = rb_entry(parent, struct vfio_dma, node);
if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova)
if (new->iova + new->size <= dma->iova)
link = &(*link)->rb_left;
else
link = &(*link)->rb_right;
@@ -144,8 +142,8 @@ static void vfio_lock_acct(long npage)
struct vwork *vwork;
struct mm_struct *mm;
if (!current->mm)
return; /* process exited */
if (!current->mm || !npage)
return; /* process exited or nothing to do */
if (down_write_trylock(&current->mm->mmap_sem)) {
current->mm->locked_vm += npage;
@@ -217,33 +215,6 @@ static int put_pfn(unsigned long pfn, int prot)
return 0;
}
/* Unmap DMA region */
static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
long npage, int prot)
{
long i, unlocked = 0;
for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
unsigned long pfn;
pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
if (pfn) {
iommu_unmap(iommu->domain, iova, PAGE_SIZE);
unlocked += put_pfn(pfn, prot);
}
}
return unlocked;
}
static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
long npage, int prot)
{
long unlocked;
unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot);
vfio_lock_acct(-unlocked);
}
static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
{
struct page *page[1];
@@ -270,79 +241,142 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
return ret;
}
/* Map DMA region */
static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
unsigned long vaddr, long npage, int prot)
/*
* Attempt to pin pages. We really don't want to track all the pfns and
* the iommu can only map chunks of consecutive pfns anyway, so get the
* first page and all consecutive pages with the same locking.
*/
static long vfio_pin_pages(unsigned long vaddr, long npage,
int prot, unsigned long *pfn_base)
{
dma_addr_t start = iova;
long i, locked = 0;
int ret;
unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
bool lock_cap = capable(CAP_IPC_LOCK);
long ret, i;
if (!current->mm)
return -ENODEV;
/* Verify that pages are not already mapped */
for (i = 0; i < npage; i++, iova += PAGE_SIZE)
if (iommu_iova_to_phys(iommu->domain, iova))
return -EBUSY;
ret = vaddr_get_pfn(vaddr, prot, pfn_base);
if (ret)
return ret;
iova = start;
if (is_invalid_reserved_pfn(*pfn_base))
return 1;
if (iommu->cache)
prot |= IOMMU_CACHE;
if (!lock_cap && current->mm->locked_vm + 1 > limit) {
put_pfn(*pfn_base, prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
limit << PAGE_SHIFT);
return -ENOMEM;
}
/*
* XXX We break mappings into pages and use get_user_pages_fast to
* pin the pages in memory. It's been suggested that mlock might
* provide a more efficient mechanism, but nothing prevents the
* user from munlocking the pages, which could then allow the user
* access to random host memory. We also have no guarantee from the
* IOMMU API that the iommu driver can unmap sub-pages of previous
* mappings. This means we might lose an entire range if a single
* page within it is unmapped. Single page mappings are inefficient,
* but provide the most flexibility for now.
*/
for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
/* Lock all the consecutive pages from pfn_base */
for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
unsigned long pfn = 0;
ret = vaddr_get_pfn(vaddr, prot, &pfn);
if (ret) {
__vfio_dma_do_unmap(iommu, start, i, prot);
return ret;
if (ret)
break;
if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
put_pfn(pfn, prot);
break;
}
if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
put_pfn(pfn, prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
__func__, limit << PAGE_SHIFT);
break;
}
}
vfio_lock_acct(i);
return i;
}
static long vfio_unpin_pages(unsigned long pfn, long npage,
int prot, bool do_accounting)
{
unsigned long unlocked = 0;
long i;
for (i = 0; i < npage; i++)
unlocked += put_pfn(pfn++, prot);
if (do_accounting)
vfio_lock_acct(-unlocked);
return unlocked;
}
static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
dma_addr_t iova, size_t *size)
{
dma_addr_t start = iova, end = iova + *size;
long unlocked = 0;
while (iova < end) {
size_t unmapped;
phys_addr_t phys;
/*
* Only add actual locked pages to accounting
* XXX We're effectively marking a page locked for every
* IOVA page even though it's possible the user could be
* backing multiple IOVAs with the same vaddr. This over-
* penalizes the user process, but we currently have no
* easy way to do this properly.
* We use the IOMMU to track the physical address. This
* saves us from having a lot more entries in our mapping
* tree. The downside is that we don't track the size
* used to do the mapping. We request unmap of a single
* page, but expect IOMMUs that support large pages to
* unmap a larger chunk.
*/
if (!is_invalid_reserved_pfn(pfn))
locked++;
ret = iommu_map(iommu->domain, iova,
(phys_addr_t)pfn << PAGE_SHIFT,
PAGE_SIZE, prot);
if (ret) {
/* Back out mappings on error */
put_pfn(pfn, prot);
__vfio_dma_do_unmap(iommu, start, i, prot);
return ret;
phys = iommu_iova_to_phys(iommu->domain, iova);
if (WARN_ON(!phys)) {
iova += PAGE_SIZE;
continue;
}
unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
if (!unmapped)
break;
unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
unmapped >> PAGE_SHIFT,
dma->prot, false);
iova += unmapped;
}
vfio_lock_acct(locked);
vfio_lock_acct(-unlocked);
*size = iova - start;
return 0;
}
static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
size_t size, struct vfio_dma *dma)
size_t *size, struct vfio_dma *dma)
{
size_t offset, overlap, tmp;
struct vfio_dma *split;
long npage_lo, npage_hi;
int ret;
/*
* Existing dma region is completely covered, unmap all. This is
* the likely case since userspace tends to map and unmap buffers
* in one shot rather than multiple mappings within a buffer.
*/
if (likely(start <= dma->iova &&
start + *size >= dma->iova + dma->size)) {
*size = dma->size;
ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
if (ret)
return ret;
/*
* Did we remove more than we have? Should never happen
* since a vfio_dma is contiguous in iova and vaddr.
*/
WARN_ON(*size != dma->size);
/* Existing dma region is completely covered, unmap all */
if (start <= dma->iova &&
start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
vfio_remove_dma(iommu, dma);
kfree(dma);
return 0;
@@ -350,47 +384,79 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
/* Overlap low address of existing range */
if (start <= dma->iova) {
size_t overlap;
overlap = start + *size - dma->iova;
ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
if (ret)
return ret;
overlap = start + size - dma->iova;
npage_lo = overlap >> PAGE_SHIFT;
vfio_remove_dma(iommu, dma);
vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot);
/*
* Check, we may have removed to whole vfio_dma. If not
* fixup and re-insert.
*/
if (overlap < dma->size) {
dma->iova += overlap;
dma->vaddr += overlap;
dma->npage -= npage_lo;
dma->size -= overlap;
vfio_insert_dma(iommu, dma);
}
*size = overlap;
return 0;
}
/* Overlap high address of existing range */
if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
size_t overlap;
if (start + *size >= dma->iova + dma->size) {
offset = start - dma->iova;
overlap = dma->size - offset;
ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
if (ret)
return ret;
overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start;
npage_hi = overlap >> PAGE_SHIFT;
/*
* We may have unmapped the entire vfio_dma if the user is
* trying to unmap a sub-region of what was originally
* mapped. If anything left, we can resize in place since
* iova is unchanged.
*/
if (overlap < dma->size)
dma->size -= overlap;
else
vfio_remove_dma(iommu, dma);
vfio_dma_unmap(iommu, start, npage_hi, dma->prot);
dma->npage -= npage_hi;
*size = overlap;
return 0;
}
/* Split existing */
npage_lo = (start - dma->iova) >> PAGE_SHIFT;
npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo;
offset = start - dma->iova;
split = kzalloc(sizeof *split, GFP_KERNEL);
if (!split)
return -ENOMEM;
ret = vfio_unmap_unpin(iommu, dma, start, size);
if (ret)
return ret;
WARN_ON(!*size);
tmp = dma->size;
vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot);
/*
* Resize the lower vfio_dma in place, insert new for remaining
* upper segment.
*/
dma->size = offset;
dma->npage = npage_lo;
if (offset + *size < tmp) {
split = kzalloc(sizeof(*split), GFP_KERNEL);
if (!split)
return -ENOMEM;
split->npage = npage_hi;
split->iova = start + size;
split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
split->size = tmp - offset - *size;
split->iova = dma->iova + offset + *size;
split->vaddr = dma->vaddr + offset + *size;
split->prot = dma->prot;
vfio_insert_dma(iommu, split);
}
return 0;
}
@@ -399,6 +465,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
{
uint64_t mask;
struct vfio_dma *dma;
size_t unmapped = 0, size;
int ret = 0;
mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
@@ -408,30 +475,66 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (unmap->size & mask)
return -EINVAL;
/* XXX We still break these down into PAGE_SIZE */
WARN_ON(mask & PAGE_MASK);
mutex_lock(&iommu->lock);
while (!ret && (dma = vfio_find_dma(iommu,
unmap->iova, unmap->size)))
ret = vfio_remove_dma_overlap(iommu, unmap->iova,
unmap->size, dma);
while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
size = unmap->size;
ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
if (ret)
break;
unmapped += size;
}
mutex_unlock(&iommu->lock);
/*
* We may unmap more than requested, update the unmap struct so
* userspace can know.
*/
unmap->size = unmapped;
return ret;
}
/*
* Turns out AMD IOMMU has a page table bug where it won't map large pages
* to a region that previously mapped smaller pages. This should be fixed
* soon, so this is just a temporary workaround to break mappings down into
* PAGE_SIZE. Better to map smaller pages than nothing.
*/
static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
unsigned long pfn, long npage, int prot)
{
long i;
int ret;
for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
ret = iommu_map(iommu->domain, iova,
(phys_addr_t)pfn << PAGE_SHIFT,
PAGE_SIZE, prot);
if (ret)
break;
}
for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
iommu_unmap(iommu->domain, iova, PAGE_SIZE);
return ret;
}
static int vfio_dma_do_map(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_map *map)
{
struct vfio_dma *dma;
dma_addr_t iova = map->iova;
unsigned long locked, lock_limit, vaddr = map->vaddr;
dma_addr_t end, iova;
unsigned long vaddr = map->vaddr;
size_t size = map->size;
long npage;
int ret = 0, prot = 0;
uint64_t mask;
long npage;
end = map->iova + map->size;
mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
@@ -444,92 +547,138 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (!prot)
return -EINVAL; /* No READ/WRITE? */
if (iommu->cache)
prot |= IOMMU_CACHE;
if (vaddr & mask)
return -EINVAL;
if (iova & mask)
if (map->iova & mask)
return -EINVAL;
if (size & mask)
if (!map->size || map->size & mask)
return -EINVAL;
/* XXX We still break these down into PAGE_SIZE */
WARN_ON(mask & PAGE_MASK);
/* Don't allow IOVA wrap */
if (iova + size && iova + size < iova)
if (end && end < map->iova)
return -EINVAL;
/* Don't allow virtual address wrap */
if (vaddr + size && vaddr + size < vaddr)
if (vaddr + map->size && vaddr + map->size < vaddr)
return -EINVAL;
npage = size >> PAGE_SHIFT;
if (!npage)
return -EINVAL;
mutex_lock(&iommu->lock);
dma = kzalloc(sizeof *dma, GFP_KERNEL);
if (!dma)
return -ENOMEM;
if (vfio_find_dma(iommu, map->iova, map->size)) {
mutex_unlock(&iommu->lock);
return -EEXIST;
}
mutex_lock(&iommu->lock);
for (iova = map->iova; iova < end; iova += size, vaddr += size) {
struct vfio_dma *dma = NULL;
unsigned long pfn;
long i;
/* Pin a contiguous chunk of memory */
npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
prot, &pfn);
if (npage <= 0) {
WARN_ON(!npage);
ret = (int)npage;
break;
}
if (vfio_find_dma(iommu, iova, size)) {
/* Verify pages are not already mapped */
for (i = 0; i < npage; i++) {
if (iommu_iova_to_phys(iommu->domain,
iova + (i << PAGE_SHIFT))) {
vfio_unpin_pages(pfn, npage, prot, true);
ret = -EBUSY;
goto out_lock;
break;
}
/* account for locked pages */
locked = current->mm->locked_vm + npage;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
__func__, rlimit(RLIMIT_MEMLOCK));
ret = -ENOMEM;
goto out_lock;
}
ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot);
if (ret)
goto out_lock;
ret = iommu_map(iommu->domain, iova,
(phys_addr_t)pfn << PAGE_SHIFT,
npage << PAGE_SHIFT, prot);
if (ret) {
if (ret != -EBUSY ||
map_try_harder(iommu, iova, pfn, npage, prot)) {
vfio_unpin_pages(pfn, npage, prot, true);
break;
}
}
dma->npage = npage;
dma->iova = iova;
dma->vaddr = vaddr;
dma->prot = prot;
size = npage << PAGE_SHIFT;
/* Check if we abut a region below - nothing below 0 */
if (iova) {
struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1);
/*
* Check if we abut a region below - nothing below 0.
* This is the most likely case when mapping chunks of
* physically contiguous regions within a virtual address
* range. Update the abutting entry in place since iova
* doesn't change.
*/
if (likely(iova)) {
struct vfio_dma *tmp;
tmp = vfio_find_dma(iommu, iova - 1, 1);
if (tmp && tmp->prot == prot &&
tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) {
vfio_remove_dma(iommu, tmp);
dma->npage += tmp->npage;
dma->iova = iova = tmp->iova;
dma->vaddr = vaddr = tmp->vaddr;
kfree(tmp);
npage = dma->npage;
size = NPAGE_TO_SIZE(npage);
tmp->vaddr + tmp->size == vaddr) {
tmp->size += size;
iova = tmp->iova;
size = tmp->size;
vaddr = tmp->vaddr;
dma = tmp;
}
}
/* Check if we abut a region above - nothing above ~0 + 1 */
if (iova + size) {
struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1);
if (likely(iova + size)) {
struct vfio_dma *tmp;
tmp = vfio_find_dma(iommu, iova + size, 1);
if (tmp && tmp->prot == prot &&
tmp->vaddr == vaddr + size) {
vfio_remove_dma(iommu, tmp);
dma->npage += tmp->npage;
if (dma)
dma->size += tmp->size;
else
size += tmp->size;
kfree(tmp);
npage = dma->npage;
size = NPAGE_TO_SIZE(npage);
}
}
if (!dma) {
dma = kzalloc(sizeof(*dma), GFP_KERNEL);
if (!dma) {
iommu_unmap(iommu->domain, iova, size);
vfio_unpin_pages(pfn, npage, prot, true);
ret = -ENOMEM;
break;
}
dma->size = size;
dma->iova = iova;
dma->vaddr = vaddr;
dma->prot = prot;
vfio_insert_dma(iommu, dma);
}
}
if (ret) {
struct vfio_dma *tmp;
iova = map->iova;
size = map->size;
while ((tmp = vfio_find_dma(iommu, iova, size))) {
if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) {
pr_warn("%s: Error rolling back failed map\n",
__func__);
break;
}
}
}
out_lock:
mutex_unlock(&iommu->lock);
if (ret)
kfree(dma);
return ret;
}
@@ -651,9 +800,8 @@ static void vfio_iommu_type1_release(void *iommu_data)
while ((node = rb_first(&iommu->dma_list))) {
struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
vfio_remove_dma(iommu, dma);
kfree(dma);
size_t size = dma->size;
vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
}
iommu_domain_free(iommu->domain);
@@ -708,6 +856,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
struct vfio_iommu_type1_dma_unmap unmap;
long ret;
minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
@@ -717,7 +866,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
if (unmap.argsz < minsz || unmap.flags)
return -EINVAL;
return vfio_dma_do_unmap(iommu, &unmap);
ret = vfio_dma_do_unmap(iommu, &unmap);
if (ret)
return ret;
return copy_to_user((void __user *)arg, &unmap, minsz);
}
return -ENOTTY;
@@ -361,10 +361,14 @@ struct vfio_iommu_type1_dma_map {
#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
/**
* VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
* VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
* struct vfio_dma_unmap)
*
* Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
* Caller sets argsz.
* Caller sets argsz. The actual unmapped size is returned in the size
* field. No guarantee is made to the user that arbitrary unmaps of iova
* or size different from those used in the original mapping call will
* succeed.
*/
struct vfio_iommu_type1_dma_unmap {
__u32 argsz;