Commit 166fd7d9 authored by Alex Williamson

vfio: hugepage support for vfio_iommu_type1

We currently send all mappings to the iommu in PAGE_SIZE chunks,
which prevents the iommu from enabling support for larger page sizes.
We still need to pin pages, which means we step through them in
PAGE_SIZE chunks, but we can batch up contiguous physical memory
chunks to allow the iommu the opportunity to use larger pages.  The
approach here is a bit different from the one currently used for
legacy KVM device assignment.  Rather than looking at the vma page
size and using that as the maximum size to pass to the iommu, we
instead simply look at whether the next page is physically
contiguous.  This means we might ask the iommu to map a 4MB region,
while legacy KVM might limit itself to a maximum of 2MB.
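
The batching loop at the heart of this (vfio_pin_pages() in the diff
below) can be sketched as follows.  This is an illustrative sketch,
not the patch itself; pin_one_page() and unpin_one_page() are
hypothetical stand-ins for the vaddr_get_pfn()/put_pfn() helpers:

    /*
     * Pin the first page, then keep extending the run for as long as
     * the next pinned page is physically contiguous with the last.
     * Returns the length of the contiguous run, in pages.
     */
    static long pin_contiguous_run(unsigned long vaddr, long npage,
                                   unsigned long *pfn_base)
    {
            long i;

            if (pin_one_page(vaddr, pfn_base))
                    return 0;

            for (i = 1; i < npage; i++) {
                    unsigned long pfn;

                    if (pin_one_page(vaddr + i * PAGE_SIZE, &pfn))
                            break;
                    if (pfn != *pfn_base + i) {
                            /* not contiguous; starts a new run */
                            unpin_one_page(pfn);
                            break;
                    }
            }
            return i;
    }

The whole run can then be handed to iommu_map() in a single call,
leaving the iommu driver free to use the largest page size that fits.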

Splitting our mapping path also allows us to be smarter about locked
memory because we can more easily unwind if the user attempts to
exceed the limit.  Therefore, rather than assuming that a mapping
will result in locked memory, we test each page as it is pinned to
determine whether it locks RAM vs an mmap'd MMIO region.  This should
result in better locking granularity and fewer locked page fudge
factors in userspace.
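
In the actual patch this test is is_invalid_reserved_pfn(), applied as
each page is pinned (a reserved pfn also ends a contiguous run).  A
simplified sketch of the accounting idea:

    /*
     * Only pfns backed by real RAM count against RLIMIT_MEMLOCK;
     * reserved/invalid pfns (e.g. mmap'd MMIO BARs) are not accounted.
     */
    long i, locked = 0;

    for (i = 0; i < npage; i++)
            if (!is_invalid_reserved_pfn(pfn_base + i))
                    locked++;

    vfio_lock_acct(locked);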

The unmap path uses the same algorithm as legacy KVM.  We don't want
to track the pfn for each mapping ourselves, but we need the pfn in
order to unpin pages.  We therefore ask the iommu for the iova to
physical address translation, ask it to unpin a page, and see how many
pages were actually unpinned.  iommus supporting large pages will
often return something bigger than a page here, which we know to be
physically contiguous, so we can unpin a batch of pfns.  iommus not
supporting large mappings won't see an improvement in batching here as
they only unmap a page at a time.
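
The walk in vfio_unmap_unpin() below reduces to roughly this loop,
where domain and unpin_pages() stand in for iommu->domain and
vfio_unpin_pages():

    while (iova < end) {
            phys_addr_t phys = iommu_iova_to_phys(domain, iova);
            size_t unmapped;

            /* ask for one page; a large-page iommu may give back more */
            unmapped = iommu_unmap(domain, iova, PAGE_SIZE);
            if (!unmapped)
                    break;

            /* everything unmapped by one call is physically contiguous */
            unpin_pages(phys >> PAGE_SHIFT, unmapped >> PAGE_SHIFT);
            iova += unmapped;
    }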

With this change, we also clarify the API for mapping and
unmapping DMA.  We can only guarantee unmaps at the same
granularity as used for the original mapping.  In other words,
unmapping a subregion of a previous mapping is not guaranteed and may
result in a larger or smaller unmapping than requested.  The size
field in the unmapping structure is updated to reflect this.
Previously this was unmodified on unmap, always returning the
requested unmap size.  This is now updated to return the actual unmap
size on success, allowing userspace to appropriately track mappings.
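
For userspace this means re-reading the size field after the ioctl
returns.  An illustrative example, assuming an open VFIO container fd
with the type1 iommu enabled and <linux/vfio.h>, <sys/ioctl.h> and
<stdio.h> included (the iova/size values are made up):

    struct vfio_iommu_type1_dma_unmap unmap = {
            .argsz = sizeof(unmap),
            .iova  = 0x100000,
            .size  = 0x200000,
    };

    if (ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap))
            perror("VFIO_IOMMU_UNMAP_DMA");
    else if (unmap.size != 0x200000)
            /* the kernel reports the size actually unmapped */
            fprintf(stderr, "unmapped 0x%llx bytes\n",
                    (unsigned long long)unmap.size);
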
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
parent cd9b2268
@@ -60,7 +60,7 @@ struct vfio_dma {
struct rb_node node;
dma_addr_t iova; /* Device address */
unsigned long vaddr; /* Process virtual addr */
long npage; /* Number of pages */
size_t size; /* Map size (bytes) */
int prot; /* IOMMU_READ/WRITE */
};
@@ -74,8 +74,6 @@ struct vfio_group {
* into DMA'ble space using the IOMMU
*/
#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT)
static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
dma_addr_t start, size_t size)
{
@@ -86,7 +84,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
if (start + size <= dma->iova)
node = node->rb_left;
else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage))
else if (start >= dma->iova + dma->size)
node = node->rb_right;
else
return dma;
@@ -104,7 +102,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
parent = *link;
dma = rb_entry(parent, struct vfio_dma, node);
if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova)
if (new->iova + new->size <= dma->iova)
link = &(*link)->rb_left;
else
link = &(*link)->rb_right;
@@ -144,8 +142,8 @@ static void vfio_lock_acct(long npage)
struct vwork *vwork;
struct mm_struct *mm;
if (!current->mm)
return; /* process exited */
if (!current->mm || !npage)
return; /* process exited or nothing to do */
if (down_write_trylock(&current->mm->mmap_sem)) {
current->mm->locked_vm += npage;
@@ -217,33 +215,6 @@ static int put_pfn(unsigned long pfn, int prot)
return 0;
}
/* Unmap DMA region */
static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
long npage, int prot)
{
long i, unlocked = 0;
for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
unsigned long pfn;
pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
if (pfn) {
iommu_unmap(iommu->domain, iova, PAGE_SIZE);
unlocked += put_pfn(pfn, prot);
}
}
return unlocked;
}
static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova,
long npage, int prot)
{
long unlocked;
unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot);
vfio_lock_acct(-unlocked);
}
static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
{
struct page *page[1];
@@ -270,79 +241,142 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn)
return ret;
}
/* Map DMA region */
static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
unsigned long vaddr, long npage, int prot)
/*
* Attempt to pin pages. We really don't want to track all the pfns and
* the iommu can only map chunks of consecutive pfns anyway, so get the
* first page and all consecutive pages with the same locking.
*/
static long vfio_pin_pages(unsigned long vaddr, long npage,
int prot, unsigned long *pfn_base)
{
dma_addr_t start = iova;
long i, locked = 0;
int ret;
unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
bool lock_cap = capable(CAP_IPC_LOCK);
long ret, i;
if (!current->mm)
return -ENODEV;
/* Verify that pages are not already mapped */
for (i = 0; i < npage; i++, iova += PAGE_SIZE)
if (iommu_iova_to_phys(iommu->domain, iova))
return -EBUSY;
ret = vaddr_get_pfn(vaddr, prot, pfn_base);
if (ret)
return ret;
iova = start;
if (is_invalid_reserved_pfn(*pfn_base))
return 1;
if (iommu->cache)
prot |= IOMMU_CACHE;
if (!lock_cap && current->mm->locked_vm + 1 > limit) {
put_pfn(*pfn_base, prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
limit << PAGE_SHIFT);
return -ENOMEM;
}
/*
* XXX We break mappings into pages and use get_user_pages_fast to
* pin the pages in memory. It's been suggested that mlock might
* provide a more efficient mechanism, but nothing prevents the
* user from munlocking the pages, which could then allow the user
* access to random host memory. We also have no guarantee from the
* IOMMU API that the iommu driver can unmap sub-pages of previous
* mappings. This means we might lose an entire range if a single
* page within it is unmapped. Single page mappings are inefficient,
* but provide the most flexibility for now.
*/
for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) {
/* Lock all the consecutive pages from pfn_base */
for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) {
unsigned long pfn = 0;
ret = vaddr_get_pfn(vaddr, prot, &pfn);
if (ret) {
__vfio_dma_do_unmap(iommu, start, i, prot);
return ret;
if (ret)
break;
if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) {
put_pfn(pfn, prot);
break;
}
if (!lock_cap && current->mm->locked_vm + i + 1 > limit) {
put_pfn(pfn, prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
__func__, limit << PAGE_SHIFT);
break;
}
}
vfio_lock_acct(i);
return i;
}
static long vfio_unpin_pages(unsigned long pfn, long npage,
int prot, bool do_accounting)
{
unsigned long unlocked = 0;
long i;
for (i = 0; i < npage; i++)
unlocked += put_pfn(pfn++, prot);
if (do_accounting)
vfio_lock_acct(-unlocked);
return unlocked;
}
static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
dma_addr_t iova, size_t *size)
{
dma_addr_t start = iova, end = iova + *size;
long unlocked = 0;
while (iova < end) {
size_t unmapped;
phys_addr_t phys;
/*
* Only add actual locked pages to accounting
* XXX We're effectively marking a page locked for every
* IOVA page even though it's possible the user could be
* backing multiple IOVAs with the same vaddr. This over-
* penalizes the user process, but we currently have no
* easy way to do this properly.
* We use the IOMMU to track the physical address. This
* saves us from having a lot more entries in our mapping
* tree. The downside is that we don't track the size
* used to do the mapping. We request unmap of a single
* page, but expect IOMMUs that support large pages to
* unmap a larger chunk.
*/
if (!is_invalid_reserved_pfn(pfn))
locked++;
ret = iommu_map(iommu->domain, iova,
(phys_addr_t)pfn << PAGE_SHIFT,
PAGE_SIZE, prot);
if (ret) {
/* Back out mappings on error */
put_pfn(pfn, prot);
__vfio_dma_do_unmap(iommu, start, i, prot);
return ret;
phys = iommu_iova_to_phys(iommu->domain, iova);
if (WARN_ON(!phys)) {
iova += PAGE_SIZE;
continue;
}
unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE);
if (!unmapped)
break;
unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT,
unmapped >> PAGE_SHIFT,
dma->prot, false);
iova += unmapped;
}
vfio_lock_acct(locked);
vfio_lock_acct(-unlocked);
*size = iova - start;
return 0;
}
static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
size_t size, struct vfio_dma *dma)
size_t *size, struct vfio_dma *dma)
{
size_t offset, overlap, tmp;
struct vfio_dma *split;
long npage_lo, npage_hi;
int ret;
/*
* Existing dma region is completely covered, unmap all. This is
* the likely case since userspace tends to map and unmap buffers
* in one shot rather than multiple mappings within a buffer.
*/
if (likely(start <= dma->iova &&
start + *size >= dma->iova + dma->size)) {
*size = dma->size;
ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
if (ret)
return ret;
/*
* Did we remove more than we have? Should never happen
* since a vfio_dma is contiguous in iova and vaddr.
*/
WARN_ON(*size != dma->size);
/* Existing dma region is completely covered, unmap all */
if (start <= dma->iova &&
start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
vfio_remove_dma(iommu, dma);
kfree(dma);
return 0;
@@ -350,47 +384,79 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
/* Overlap low address of existing range */
if (start <= dma->iova) {
size_t overlap;
overlap = start + *size - dma->iova;
ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
if (ret)
return ret;
overlap = start + size - dma->iova;
npage_lo = overlap >> PAGE_SHIFT;
vfio_remove_dma(iommu, dma);
vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot);
/*
* Check, we may have removed to whole vfio_dma. If not
* fixup and re-insert.
*/
if (overlap < dma->size) {
dma->iova += overlap;
dma->vaddr += overlap;
dma->npage -= npage_lo;
dma->size -= overlap;
vfio_insert_dma(iommu, dma);
}
*size = overlap;
return 0;
}
/* Overlap high address of existing range */
if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) {
size_t overlap;
if (start + *size >= dma->iova + dma->size) {
offset = start - dma->iova;
overlap = dma->size - offset;
ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
if (ret)
return ret;
overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start;
npage_hi = overlap >> PAGE_SHIFT;
/*
* We may have unmapped the entire vfio_dma if the user is
* trying to unmap a sub-region of what was originally
* mapped. If anything left, we can resize in place since
* iova is unchanged.
*/
if (overlap < dma->size)
dma->size -= overlap;
else
vfio_remove_dma(iommu, dma);
vfio_dma_unmap(iommu, start, npage_hi, dma->prot);
dma->npage -= npage_hi;
*size = overlap;
return 0;
}
/* Split existing */
npage_lo = (start - dma->iova) >> PAGE_SHIFT;
npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo;
offset = start - dma->iova;
split = kzalloc(sizeof *split, GFP_KERNEL);
if (!split)
return -ENOMEM;
ret = vfio_unmap_unpin(iommu, dma, start, size);
if (ret)
return ret;
WARN_ON(!*size);
tmp = dma->size;
vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot);
/*
* Resize the lower vfio_dma in place, insert new for remaining
* upper segment.
*/
dma->size = offset;
dma->npage = npage_lo;
if (offset + *size < tmp) {
split = kzalloc(sizeof(*split), GFP_KERNEL);
if (!split)
return -ENOMEM;
split->npage = npage_hi;
split->iova = start + size;
split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size;
split->size = tmp - offset - *size;
split->iova = dma->iova + offset + *size;
split->vaddr = dma->vaddr + offset + *size;
split->prot = dma->prot;
vfio_insert_dma(iommu, split);
}
return 0;
}
@@ -399,6 +465,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
{
uint64_t mask;
struct vfio_dma *dma;
size_t unmapped = 0, size;
int ret = 0;
mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
@@ -408,30 +475,66 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
if (unmap->size & mask)
return -EINVAL;
/* XXX We still break these down into PAGE_SIZE */
WARN_ON(mask & PAGE_MASK);
mutex_lock(&iommu->lock);
while (!ret && (dma = vfio_find_dma(iommu,
unmap->iova, unmap->size)))
ret = vfio_remove_dma_overlap(iommu, unmap->iova,
unmap->size, dma);
while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
size = unmap->size;
ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
if (ret)
break;
unmapped += size;
}
mutex_unlock(&iommu->lock);
/*
* We may unmap more than requested, update the unmap struct so
* userspace can know.
*/
unmap->size = unmapped;
return ret;
}
/*
* Turns out AMD IOMMU has a page table bug where it won't map large pages
* to a region that previously mapped smaller pages. This should be fixed
* soon, so this is just a temporary workaround to break mappings down into
* PAGE_SIZE. Better to map smaller pages than nothing.
*/
static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova,
unsigned long pfn, long npage, int prot)
{
long i;
int ret;
for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
ret = iommu_map(iommu->domain, iova,
(phys_addr_t)pfn << PAGE_SHIFT,
PAGE_SIZE, prot);
if (ret)
break;
}
for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
iommu_unmap(iommu->domain, iova, PAGE_SIZE);
return ret;
}
static int vfio_dma_do_map(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_map *map)
{
struct vfio_dma *dma;
dma_addr_t iova = map->iova;
unsigned long locked, lock_limit, vaddr = map->vaddr;
dma_addr_t end, iova;
unsigned long vaddr = map->vaddr;
size_t size = map->size;
long npage;
int ret = 0, prot = 0;
uint64_t mask;
long npage;
end = map->iova + map->size;
mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1;
@@ -444,92 +547,138 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (!prot)
return -EINVAL; /* No READ/WRITE? */
if (iommu->cache)
prot |= IOMMU_CACHE;
if (vaddr & mask)
return -EINVAL;
if (iova & mask)
if (map->iova & mask)
return -EINVAL;
if (size & mask)
if (!map->size || map->size & mask)
return -EINVAL;
/* XXX We still break these down into PAGE_SIZE */
WARN_ON(mask & PAGE_MASK);
/* Don't allow IOVA wrap */
if (iova + size && iova + size < iova)
if (end && end < map->iova)
return -EINVAL;
/* Don't allow virtual address wrap */
if (vaddr + size && vaddr + size < vaddr)
if (vaddr + map->size && vaddr + map->size < vaddr)
return -EINVAL;
npage = size >> PAGE_SHIFT;
if (!npage)
return -EINVAL;
mutex_lock(&iommu->lock);
dma = kzalloc(sizeof *dma, GFP_KERNEL);
if (!dma)
return -ENOMEM;
if (vfio_find_dma(iommu, map->iova, map->size)) {
mutex_unlock(&iommu->lock);
return -EEXIST;
}
mutex_lock(&iommu->lock);
for (iova = map->iova; iova < end; iova += size, vaddr += size) {
struct vfio_dma *dma = NULL;
unsigned long pfn;
long i;
/* Pin a contiguous chunk of memory */
npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT,
prot, &pfn);
if (npage <= 0) {
WARN_ON(!npage);
ret = (int)npage;
break;
}
if (vfio_find_dma(iommu, iova, size)) {
/* Verify pages are not already mapped */
for (i = 0; i < npage; i++) {
if (iommu_iova_to_phys(iommu->domain,
iova + (i << PAGE_SHIFT))) {
vfio_unpin_pages(pfn, npage, prot, true);
ret = -EBUSY;
goto out_lock;
break;
}
/* account for locked pages */
locked = current->mm->locked_vm + npage;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
__func__, rlimit(RLIMIT_MEMLOCK));
ret = -ENOMEM;
goto out_lock;
}
ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot);
if (ret)
goto out_lock;
ret = iommu_map(iommu->domain, iova,
(phys_addr_t)pfn << PAGE_SHIFT,
npage << PAGE_SHIFT, prot);
if (ret) {
if (ret != -EBUSY ||
map_try_harder(iommu, iova, pfn, npage, prot)) {
vfio_unpin_pages(pfn, npage, prot, true);
break;
}
}
dma->npage = npage;
dma->iova = iova;
dma->vaddr = vaddr;
dma->prot = prot;
size = npage << PAGE_SHIFT;
/* Check if we abut a region below - nothing below 0 */
if (iova) {
struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1);
/*
* Check if we abut a region below - nothing below 0.
* This is the most likely case when mapping chunks of
* physically contiguous regions within a virtual address
* range. Update the abutting entry in place since iova
* doesn't change.
*/
if (likely(iova)) {
struct vfio_dma *tmp;
tmp = vfio_find_dma(iommu, iova - 1, 1);
if (tmp && tmp->prot == prot &&
tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) {
vfio_remove_dma(iommu, tmp);
dma->npage += tmp->npage;
dma->iova = iova = tmp->iova;
dma->vaddr = vaddr = tmp->vaddr;
kfree(tmp);
npage = dma->npage;
size = NPAGE_TO_SIZE(npage);
tmp->vaddr + tmp->size == vaddr) {
tmp->size += size;
iova = tmp->iova;
size = tmp->size;
vaddr = tmp->vaddr;
dma = tmp;
}
}
/* Check if we abut a region above - nothing above ~0 + 1 */
if (iova + size) {
struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1);
if (likely(iova + size)) {
struct vfio_dma *tmp;
tmp = vfio_find_dma(iommu, iova + size, 1);
if (tmp && tmp->prot == prot &&
tmp->vaddr == vaddr + size) {
vfio_remove_dma(iommu, tmp);
dma->npage += tmp->npage;
if (dma)
dma->size += tmp->size;
else
size += tmp->size;
kfree(tmp);
npage = dma->npage;
size = NPAGE_TO_SIZE(npage);
}
}
if (!dma) {
dma = kzalloc(sizeof(*dma), GFP_KERNEL);
if (!dma) {
iommu_unmap(iommu->domain, iova, size);
vfio_unpin_pages(pfn, npage, prot, true);
ret = -ENOMEM;
break;
}
dma->size = size;
dma->iova = iova;
dma->vaddr = vaddr;
dma->prot = prot;
vfio_insert_dma(iommu, dma);
}
}
if (ret) {
struct vfio_dma *tmp;
iova = map->iova;
size = map->size;
while ((tmp = vfio_find_dma(iommu, iova, size))) {
if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) {
pr_warn("%s: Error rolling back failed map\n",
__func__);
break;
}
}
}
out_lock:
mutex_unlock(&iommu->lock);
if (ret)
kfree(dma);
return ret;
}
@@ -651,9 +800,8 @@ static void vfio_iommu_type1_release(void *iommu_data)
while ((node = rb_first(&iommu->dma_list))) {
struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot);
vfio_remove_dma(iommu, dma);
kfree(dma);
size_t size = dma->size;
vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
}
iommu_domain_free(iommu->domain);
@@ -708,6 +856,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
struct vfio_iommu_type1_dma_unmap unmap;
long ret;
minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
@@ -717,7 +866,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
if (unmap.argsz < minsz || unmap.flags)
return -EINVAL;
return vfio_dma_do_unmap(iommu, &unmap);
ret = vfio_dma_do_unmap(iommu, &unmap);
if (ret)
return ret;
return copy_to_user((void __user *)arg, &unmap, minsz);
}
return -ENOTTY;
@@ -361,10 +361,14 @@ struct vfio_iommu_type1_dma_map {
#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
/**
* VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
* VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14,
* struct vfio_dma_unmap)
*
* Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
* Caller sets argsz.
* Caller sets argsz. The actual unmapped size is returned in the size
* field. No guarantee is made to the user that arbitrary unmaps of iova
* or size different from those used in the original mapping call will
* succeed.
*/
struct vfio_iommu_type1_dma_unmap {
__u32 argsz;