Commit fffe3ae0, authored by Linus Torvalds

Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull hmm updates from Jason Gunthorpe:
 "Ralph has been working on nouveau's use of hmm_range_fault() and
  migrate_vma() which resulted in this small series. It adds reporting
  of the page table order from hmm_range_fault() and some optimization
  of migrate_vma():

   - Report the size of the page table mapping out of hmm_range_fault().

     This makes it easier to establish a large/huge/etc mapping in the
     device's page table.

   - Allow devices to ignore the invalidations during migration in cases
     where the migration is not going to change pages.

     For instance migrating pages to a device does not require the
     device to invalidate pages already in the device.

   - Update nouveau and hmm_tests to use the above"

* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
  mm/hmm/test: use the new migration invalidation
  nouveau/svm: use the new migration invalidation
  mm/notifier: add migration invalidation type
  mm/migrate: add a flags parameter to migrate_vma
  nouveau: fix storing invalid ptes
  nouveau/hmm: support mapping large sysmem pages
  nouveau: fix mapping 2MB sysmem pages
  nouveau/hmm: fault one page at a time
  mm/hmm: add tests for hmm_pfn_to_map_order()
  mm/hmm: provide the page mapping order in hmm_range_fault()
parents 8f7be629 7d17e83a
...@@ -400,6 +400,7 @@ kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start, ...@@ -400,6 +400,7 @@ kvmppc_svm_page_in(struct vm_area_struct *vma, unsigned long start,
mig.end = end; mig.end = end;
mig.src = &src_pfn; mig.src = &src_pfn;
mig.dst = &dst_pfn; mig.dst = &dst_pfn;
mig.flags = MIGRATE_VMA_SELECT_SYSTEM;
/* /*
* We come here with mmap_lock write lock held just for * We come here with mmap_lock write lock held just for
...@@ -577,7 +578,8 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, ...@@ -577,7 +578,8 @@ kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start,
mig.end = end; mig.end = end;
mig.src = &src_pfn; mig.src = &src_pfn;
mig.dst = &dst_pfn; mig.dst = &dst_pfn;
mig.src_owner = &kvmppc_uvmem_pgmap; mig.pgmap_owner = &kvmppc_uvmem_pgmap;
mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
mutex_lock(&kvm->arch.uvmem_lock); mutex_lock(&kvm->arch.uvmem_lock);
/* The requested page is already paged-out, nothing to do */ /* The requested page is already paged-out, nothing to do */
......
...@@ -140,6 +140,7 @@ static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm, ...@@ -140,6 +140,7 @@ static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm,
{ {
struct device *dev = drm->dev->dev; struct device *dev = drm->dev->dev;
struct page *dpage, *spage; struct page *dpage, *spage;
struct nouveau_svmm *svmm;
spage = migrate_pfn_to_page(args->src[0]); spage = migrate_pfn_to_page(args->src[0]);
if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE)) if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE))
...@@ -154,14 +155,19 @@ static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm, ...@@ -154,14 +155,19 @@ static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm,
if (dma_mapping_error(dev, *dma_addr)) if (dma_mapping_error(dev, *dma_addr))
goto error_free_page; goto error_free_page;
svmm = spage->zone_device_data;
mutex_lock(&svmm->mutex);
nouveau_svmm_invalidate(svmm, args->start, args->end);
if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr, if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_HOST, *dma_addr,
NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage))) NOUVEAU_APER_VRAM, nouveau_dmem_page_addr(spage)))
goto error_dma_unmap; goto error_dma_unmap;
mutex_unlock(&svmm->mutex);
args->dst[0] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; args->dst[0] = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
return 0; return 0;
error_dma_unmap: error_dma_unmap:
mutex_unlock(&svmm->mutex);
dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
error_free_page: error_free_page:
__free_page(dpage); __free_page(dpage);
...@@ -182,7 +188,8 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf) ...@@ -182,7 +188,8 @@ static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
.end = vmf->address + PAGE_SIZE, .end = vmf->address + PAGE_SIZE,
.src = &src, .src = &src,
.dst = &dst, .dst = &dst,
.src_owner = drm->dev, .pgmap_owner = drm->dev,
.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE,
}; };
/* /*
...@@ -530,7 +537,8 @@ nouveau_dmem_init(struct nouveau_drm *drm) ...@@ -530,7 +537,8 @@ nouveau_dmem_init(struct nouveau_drm *drm)
} }
static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
unsigned long src, dma_addr_t *dma_addr, u64 *pfn) struct nouveau_svmm *svmm, unsigned long src,
dma_addr_t *dma_addr, u64 *pfn)
{ {
struct device *dev = drm->dev->dev; struct device *dev = drm->dev->dev;
struct page *dpage, *spage; struct page *dpage, *spage;
...@@ -560,6 +568,7 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, ...@@ -560,6 +568,7 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
goto out_free_page; goto out_free_page;
} }
dpage->zone_device_data = svmm;
*pfn = NVIF_VMM_PFNMAP_V0_V | NVIF_VMM_PFNMAP_V0_VRAM | *pfn = NVIF_VMM_PFNMAP_V0_V | NVIF_VMM_PFNMAP_V0_VRAM |
((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT); ((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT);
if (src & MIGRATE_PFN_WRITE) if (src & MIGRATE_PFN_WRITE)
...@@ -583,8 +592,8 @@ static void nouveau_dmem_migrate_chunk(struct nouveau_drm *drm, ...@@ -583,8 +592,8 @@ static void nouveau_dmem_migrate_chunk(struct nouveau_drm *drm,
unsigned long addr = args->start, nr_dma = 0, i; unsigned long addr = args->start, nr_dma = 0, i;
for (i = 0; addr < args->end; i++) { for (i = 0; addr < args->end; i++) {
args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->src[i], args->dst[i] = nouveau_dmem_migrate_copy_one(drm, svmm,
dma_addrs + nr_dma, pfns + i); args->src[i], dma_addrs + nr_dma, pfns + i);
if (!dma_mapping_error(drm->dev->dev, dma_addrs[nr_dma])) if (!dma_mapping_error(drm->dev->dev, dma_addrs[nr_dma]))
nr_dma++; nr_dma++;
addr += PAGE_SIZE; addr += PAGE_SIZE;
...@@ -615,6 +624,8 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, ...@@ -615,6 +624,8 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
struct migrate_vma args = { struct migrate_vma args = {
.vma = vma, .vma = vma,
.start = start, .start = start,
.pgmap_owner = drm->dev,
.flags = MIGRATE_VMA_SELECT_SYSTEM,
}; };
unsigned long i; unsigned long i;
u64 *pfns; u64 *pfns;
......
...@@ -93,17 +93,6 @@ nouveau_ivmm_find(struct nouveau_svm *svm, u64 inst) ...@@ -93,17 +93,6 @@ nouveau_ivmm_find(struct nouveau_svm *svm, u64 inst)
return NULL; return NULL;
} }
struct nouveau_svmm {
struct mmu_notifier notifier;
struct nouveau_vmm *vmm;
struct {
unsigned long start;
unsigned long limit;
} unmanaged;
struct mutex mutex;
};
#define SVMM_DBG(s,f,a...) \ #define SVMM_DBG(s,f,a...) \
NV_DEBUG((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a) NV_DEBUG((s)->vmm->cli->drm, "svm-%p: "f"\n", (s), ##a)
#define SVMM_ERR(s,f,a...) \ #define SVMM_ERR(s,f,a...) \
...@@ -246,7 +235,7 @@ nouveau_svmm_join(struct nouveau_svmm *svmm, u64 inst) ...@@ -246,7 +235,7 @@ nouveau_svmm_join(struct nouveau_svmm *svmm, u64 inst)
} }
/* Invalidate SVMM address-range on GPU. */ /* Invalidate SVMM address-range on GPU. */
static void void
nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit) nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit)
{ {
if (limit > start) { if (limit > start) {
...@@ -279,6 +268,14 @@ nouveau_svmm_invalidate_range_start(struct mmu_notifier *mn, ...@@ -279,6 +268,14 @@ nouveau_svmm_invalidate_range_start(struct mmu_notifier *mn,
if (unlikely(!svmm->vmm)) if (unlikely(!svmm->vmm))
goto out; goto out;
/*
* Ignore invalidation callbacks for device private pages since
* the invalidation is handled as part of the migration process.
*/
if (update->event == MMU_NOTIFY_MIGRATE &&
update->migrate_pgmap_owner == svmm->vmm->cli->drm->dev)
goto out;
if (limit > svmm->unmanaged.start && start < svmm->unmanaged.limit) { if (limit > svmm->unmanaged.start && start < svmm->unmanaged.limit) {
if (start < svmm->unmanaged.start) { if (start < svmm->unmanaged.start) {
nouveau_svmm_invalidate(svmm, start, nouveau_svmm_invalidate(svmm, start,
...@@ -514,53 +511,68 @@ static const struct mmu_interval_notifier_ops nouveau_svm_mni_ops = { ...@@ -514,53 +511,68 @@ static const struct mmu_interval_notifier_ops nouveau_svm_mni_ops = {
}; };
static void nouveau_hmm_convert_pfn(struct nouveau_drm *drm, static void nouveau_hmm_convert_pfn(struct nouveau_drm *drm,
struct hmm_range *range, u64 *ioctl_addr) struct hmm_range *range,
struct nouveau_pfnmap_args *args)
{ {
unsigned long i, npages; struct page *page;
/* /*
* The ioctl_addr prepared here is passed through nvif_object_ioctl() * The address prepared here is passed through nvif_object_ioctl()
* to an eventual DMA map in something like gp100_vmm_pgt_pfn() * to an eventual DMA map in something like gp100_vmm_pgt_pfn()
* *
* This is all just encoding the internal hmm representation into a * This is all just encoding the internal hmm representation into a
* different nouveau internal representation. * different nouveau internal representation.
*/ */
npages = (range->end - range->start) >> PAGE_SHIFT; if (!(range->hmm_pfns[0] & HMM_PFN_VALID)) {
for (i = 0; i < npages; ++i) { args->p.phys[0] = 0;
struct page *page; return;
if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) {
ioctl_addr[i] = 0;
continue;
} }
page = hmm_pfn_to_page(range->hmm_pfns[i]); page = hmm_pfn_to_page(range->hmm_pfns[0]);
/*
* Only map compound pages to the GPU if the CPU is also mapping the
* page as a compound page. Otherwise, the PTE protections might not be
* consistent (e.g., CPU only maps part of a compound page).
* Note that the underlying page might still be larger than the
* CPU mapping (e.g., a PUD sized compound page partially mapped with
* a PMD sized page table entry).
*/
if (hmm_pfn_to_map_order(range->hmm_pfns[0])) {
unsigned long addr = args->p.addr;
args->p.page = hmm_pfn_to_map_order(range->hmm_pfns[0]) +
PAGE_SHIFT;
args->p.size = 1UL << args->p.page;
args->p.addr &= ~(args->p.size - 1);
page -= (addr - args->p.addr) >> PAGE_SHIFT;
}
if (is_device_private_page(page)) if (is_device_private_page(page))
ioctl_addr[i] = nouveau_dmem_page_addr(page) | args->p.phys[0] = nouveau_dmem_page_addr(page) |
NVIF_VMM_PFNMAP_V0_V | NVIF_VMM_PFNMAP_V0_V |
NVIF_VMM_PFNMAP_V0_VRAM; NVIF_VMM_PFNMAP_V0_VRAM;
else else
ioctl_addr[i] = page_to_phys(page) | args->p.phys[0] = page_to_phys(page) |
NVIF_VMM_PFNMAP_V0_V | NVIF_VMM_PFNMAP_V0_V |
NVIF_VMM_PFNMAP_V0_HOST; NVIF_VMM_PFNMAP_V0_HOST;
if (range->hmm_pfns[i] & HMM_PFN_WRITE) if (range->hmm_pfns[0] & HMM_PFN_WRITE)
ioctl_addr[i] |= NVIF_VMM_PFNMAP_V0_W; args->p.phys[0] |= NVIF_VMM_PFNMAP_V0_W;
}
} }
static int nouveau_range_fault(struct nouveau_svmm *svmm, static int nouveau_range_fault(struct nouveau_svmm *svmm,
struct nouveau_drm *drm, void *data, u32 size, struct nouveau_drm *drm,
unsigned long hmm_pfns[], u64 *ioctl_addr, struct nouveau_pfnmap_args *args, u32 size,
unsigned long hmm_flags,
struct svm_notifier *notifier) struct svm_notifier *notifier)
{ {
unsigned long timeout = unsigned long timeout =
jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT); jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
/* Have HMM fault pages within the fault window to the GPU. */ /* Have HMM fault pages within the fault window to the GPU. */
unsigned long hmm_pfns[1];
struct hmm_range range = { struct hmm_range range = {
.notifier = &notifier->notifier, .notifier = &notifier->notifier,
.start = notifier->notifier.interval_tree.start, .start = notifier->notifier.interval_tree.start,
.end = notifier->notifier.interval_tree.last + 1, .end = notifier->notifier.interval_tree.last + 1,
.pfn_flags_mask = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE, .default_flags = hmm_flags,
.hmm_pfns = hmm_pfns, .hmm_pfns = hmm_pfns,
.dev_private_owner = drm->dev, .dev_private_owner = drm->dev,
}; };
...@@ -576,11 +588,6 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm, ...@@ -576,11 +588,6 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
ret = hmm_range_fault(&range); ret = hmm_range_fault(&range);
mmap_read_unlock(mm); mmap_read_unlock(mm);
if (ret) { if (ret) {
/*
* FIXME: the input PFN_REQ flags are destroyed on
* -EBUSY, we need to regenerate them, also for the
* other continue below
*/
if (ret == -EBUSY) if (ret == -EBUSY)
continue; continue;
return ret; return ret;
...@@ -595,10 +602,10 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm, ...@@ -595,10 +602,10 @@ static int nouveau_range_fault(struct nouveau_svmm *svmm,
break; break;
} }
nouveau_hmm_convert_pfn(drm, &range, ioctl_addr); nouveau_hmm_convert_pfn(drm, &range, args);
svmm->vmm->vmm.object.client->super = true; svmm->vmm->vmm.object.client->super = true;
ret = nvif_object_ioctl(&svmm->vmm->vmm.object, data, size, NULL); ret = nvif_object_ioctl(&svmm->vmm->vmm.object, args, size, NULL);
svmm->vmm->vmm.object.client->super = false; svmm->vmm->vmm.object.client->super = false;
mutex_unlock(&svmm->mutex); mutex_unlock(&svmm->mutex);
...@@ -615,17 +622,12 @@ nouveau_svm_fault(struct nvif_notify *notify) ...@@ -615,17 +622,12 @@ nouveau_svm_fault(struct nvif_notify *notify)
struct nvif_object *device = &svm->drm->client.device.object; struct nvif_object *device = &svm->drm->client.device.object;
struct nouveau_svmm *svmm; struct nouveau_svmm *svmm;
struct { struct {
struct { struct nouveau_pfnmap_args i;
struct nvif_ioctl_v0 i; u64 phys[1];
struct nvif_ioctl_mthd_v0 m;
struct nvif_vmm_pfnmap_v0 p;
} i;
u64 phys[16];
} args; } args;
unsigned long hmm_pfns[ARRAY_SIZE(args.phys)]; unsigned long hmm_flags;
struct vm_area_struct *vma;
u64 inst, start, limit; u64 inst, start, limit;
int fi, fn, pi, fill; int fi, fn;
int replay = 0, ret; int replay = 0, ret;
/* Parse available fault buffer entries into a cache, and update /* Parse available fault buffer entries into a cache, and update
...@@ -692,129 +694,84 @@ nouveau_svm_fault(struct nvif_notify *notify) ...@@ -692,129 +694,84 @@ nouveau_svm_fault(struct nvif_notify *notify)
* window into a single update. * window into a single update.
*/ */
start = buffer->fault[fi]->addr; start = buffer->fault[fi]->addr;
limit = start + (ARRAY_SIZE(args.phys) << PAGE_SHIFT); limit = start + PAGE_SIZE;
if (start < svmm->unmanaged.limit) if (start < svmm->unmanaged.limit)
limit = min_t(u64, limit, svmm->unmanaged.start); limit = min_t(u64, limit, svmm->unmanaged.start);
SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
mm = svmm->notifier.mm;
if (!mmget_not_zero(mm)) {
nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
continue;
}
/* Intersect fault window with the CPU VMA, cancelling
* the fault if the address is invalid.
*/
mmap_read_lock(mm);
vma = find_vma_intersection(mm, start, limit);
if (!vma) {
SVMM_ERR(svmm, "wndw %016llx-%016llx", start, limit);
mmap_read_unlock(mm);
mmput(mm);
nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
continue;
}
start = max_t(u64, start, vma->vm_start);
limit = min_t(u64, limit, vma->vm_end);
mmap_read_unlock(mm);
SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
if (buffer->fault[fi]->addr != start) {
SVMM_ERR(svmm, "addr %016llx", buffer->fault[fi]->addr);
mmput(mm);
nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
continue;
}
/* Prepare the GPU-side update of all pages within the /*
* Prepare the GPU-side update of all pages within the
* fault window, determining required pages and access * fault window, determining required pages and access
* permissions based on pending faults. * permissions based on pending faults.
*/ */
args.i.p.page = PAGE_SHIFT;
args.i.p.addr = start; args.i.p.addr = start;
for (fn = fi, pi = 0;;) { args.i.p.page = PAGE_SHIFT;
/* Determine required permissions based on GPU fault args.i.p.size = PAGE_SIZE;
/*
* Determine required permissions based on GPU fault
* access flags. * access flags.
*XXX: atomic? * XXX: atomic?
*/ */
switch (buffer->fault[fn]->access) { switch (buffer->fault[fi]->access) {
case 0: /* READ. */ case 0: /* READ. */
hmm_pfns[pi++] = HMM_PFN_REQ_FAULT; hmm_flags = HMM_PFN_REQ_FAULT;
break; break;
case 3: /* PREFETCH. */ case 3: /* PREFETCH. */
hmm_pfns[pi++] = 0; hmm_flags = 0;
break; break;
default: default:
hmm_pfns[pi++] = HMM_PFN_REQ_FAULT | hmm_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE;
HMM_PFN_REQ_WRITE;
break; break;
} }
args.i.p.size = pi << PAGE_SHIFT;
/* It's okay to skip over duplicate addresses from the
* same SVMM as faults are ordered by access type such
* that only the first one needs to be handled.
*
* ie. WRITE faults appear first, thus any handling of
* pending READ faults will already be satisfied.
*/
while (++fn < buffer->fault_nr &&
buffer->fault[fn]->svmm == svmm &&
buffer->fault[fn ]->addr ==
buffer->fault[fn - 1]->addr);
/* If the next fault is outside the window, or all GPU mm = svmm->notifier.mm;
* faults have been dealt with, we're done here. if (!mmget_not_zero(mm)) {
*/ nouveau_svm_fault_cancel_fault(svm, buffer->fault[fi]);
if (fn >= buffer->fault_nr || continue;
buffer->fault[fn]->svmm != svmm ||
buffer->fault[fn]->addr >= limit)
break;
/* Fill in the gap between this fault and the next. */
fill = (buffer->fault[fn ]->addr -
buffer->fault[fn - 1]->addr) >> PAGE_SHIFT;
while (--fill)
hmm_pfns[pi++] = 0;
} }
SVMM_DBG(svmm, "wndw %016llx-%016llx covering %d fault(s)",
args.i.p.addr,
args.i.p.addr + args.i.p.size, fn - fi);
notifier.svmm = svmm; notifier.svmm = svmm;
ret = mmu_interval_notifier_insert(&notifier.notifier, ret = mmu_interval_notifier_insert(&notifier.notifier, mm,
svmm->notifier.mm,
args.i.p.addr, args.i.p.size, args.i.p.addr, args.i.p.size,
&nouveau_svm_mni_ops); &nouveau_svm_mni_ops);
if (!ret) { if (!ret) {
ret = nouveau_range_fault( ret = nouveau_range_fault(svmm, svm->drm, &args.i,
svmm, svm->drm, &args, sizeof(args), hmm_flags, &notifier);
sizeof(args.i) + pi * sizeof(args.phys[0]),
hmm_pfns, args.phys, &notifier);
mmu_interval_notifier_remove(&notifier.notifier); mmu_interval_notifier_remove(&notifier.notifier);
} }
mmput(mm); mmput(mm);
/* Cancel any faults in the window whose pages didn't manage limit = args.i.p.addr + args.i.p.size;
* to keep their valid bit, or stay writeable when required. for (fn = fi; ++fn < buffer->fault_nr; ) {
/* It's okay to skip over duplicate addresses from the
* same SVMM as faults are ordered by access type such
* that only the first one needs to be handled.
* *
* If handling failed completely, cancel all faults. * ie. WRITE faults appear first, thus any handling of
* pending READ faults will already be satisfied.
* But if a large page is mapped, make sure subsequent
* fault addresses have sufficient access permission.
*/ */
if (buffer->fault[fn]->svmm != svmm ||
buffer->fault[fn]->addr >= limit ||
(buffer->fault[fi]->access == 0 /* READ. */ &&
!(args.phys[0] & NVIF_VMM_PFNMAP_V0_V)) ||
(buffer->fault[fi]->access != 0 /* READ. */ &&
buffer->fault[fi]->access != 3 /* PREFETCH. */ &&
!(args.phys[0] & NVIF_VMM_PFNMAP_V0_W)))
break;
}
/* If handling failed completely, cancel all faults. */
if (ret) {
while (fi < fn) { while (fi < fn) {
struct nouveau_svm_fault *fault = buffer->fault[fi++]; struct nouveau_svm_fault *fault =
pi = (fault->addr - args.i.p.addr) >> PAGE_SHIFT; buffer->fault[fi++];
if (ret ||
!(args.phys[pi] & NVIF_VMM_PFNMAP_V0_V) ||
(!(args.phys[pi] & NVIF_VMM_PFNMAP_V0_W) &&
fault->access != 0 && fault->access != 3)) {
nouveau_svm_fault_cancel_fault(svm, fault); nouveau_svm_fault_cancel_fault(svm, fault);
continue;
} }
} else
replay++; replay++;
} }
}
/* Issue fault replay to the GPU. */ /* Issue fault replay to the GPU. */
if (replay) if (replay)
......
#ifndef __NOUVEAU_SVM_H__ #ifndef __NOUVEAU_SVM_H__
#define __NOUVEAU_SVM_H__ #define __NOUVEAU_SVM_H__
#include <nvif/os.h> #include <nvif/os.h>
#include <linux/mmu_notifier.h>
struct drm_device; struct drm_device;
struct drm_file; struct drm_file;
struct nouveau_drm; struct nouveau_drm;
struct nouveau_svmm; struct nouveau_svmm {
struct mmu_notifier notifier;
struct nouveau_vmm *vmm;
struct {
unsigned long start;
unsigned long limit;
} unmanaged;
struct mutex mutex;
};
#if IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) #if IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM)
void nouveau_svm_init(struct nouveau_drm *); void nouveau_svm_init(struct nouveau_drm *);
...@@ -19,6 +29,7 @@ int nouveau_svmm_join(struct nouveau_svmm *, u64 inst); ...@@ -19,6 +29,7 @@ int nouveau_svmm_join(struct nouveau_svmm *, u64 inst);
void nouveau_svmm_part(struct nouveau_svmm *, u64 inst); void nouveau_svmm_part(struct nouveau_svmm *, u64 inst);
int nouveau_svmm_bind(struct drm_device *, void *, struct drm_file *); int nouveau_svmm_bind(struct drm_device *, void *, struct drm_file *);
void nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit);
u64 *nouveau_pfns_alloc(unsigned long npages); u64 *nouveau_pfns_alloc(unsigned long npages);
void nouveau_pfns_free(u64 *pfns); void nouveau_pfns_free(u64 *pfns);
void nouveau_pfns_map(struct nouveau_svmm *svmm, struct mm_struct *mm, void nouveau_pfns_map(struct nouveau_svmm *svmm, struct mm_struct *mm,
......
...@@ -1204,7 +1204,6 @@ nvkm_vmm_pfn_unmap(struct nvkm_vmm *vmm, u64 addr, u64 size) ...@@ -1204,7 +1204,6 @@ nvkm_vmm_pfn_unmap(struct nvkm_vmm *vmm, u64 addr, u64 size)
/*TODO: /*TODO:
* - Avoid PT readback (for dma_unmap etc), this might end up being dealt * - Avoid PT readback (for dma_unmap etc), this might end up being dealt
* with inside HMM, which would be a lot nicer for us to deal with. * with inside HMM, which would be a lot nicer for us to deal with.
* - Multiple page sizes (particularly for huge page support).
* - Support for systems without a 4KiB page size. * - Support for systems without a 4KiB page size.
*/ */
int int
...@@ -1220,8 +1219,8 @@ nvkm_vmm_pfn_map(struct nvkm_vmm *vmm, u8 shift, u64 addr, u64 size, u64 *pfn) ...@@ -1220,8 +1219,8 @@ nvkm_vmm_pfn_map(struct nvkm_vmm *vmm, u8 shift, u64 addr, u64 size, u64 *pfn)
/* Only support mapping where the page size of the incoming page /* Only support mapping where the page size of the incoming page
* array matches a page size available for direct mapping. * array matches a page size available for direct mapping.
*/ */
while (page->shift && page->shift != shift && while (page->shift && (page->shift != shift ||
page->desc->func->pfn == NULL) page->desc->func->pfn == NULL))
page++; page++;
if (!page->shift || !IS_ALIGNED(addr, 1ULL << shift) || if (!page->shift || !IS_ALIGNED(addr, 1ULL << shift) ||
......
...@@ -79,8 +79,12 @@ gp100_vmm_pgt_pfn(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, ...@@ -79,8 +79,12 @@ gp100_vmm_pgt_pfn(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt,
dma_addr_t addr; dma_addr_t addr;
nvkm_kmap(pt->memory); nvkm_kmap(pt->memory);
while (ptes--) { for (; ptes; ptes--, map->pfn++) {
u64 data = 0; u64 data = 0;
if (!(*map->pfn & NVKM_VMM_PFN_V))
continue;
if (!(*map->pfn & NVKM_VMM_PFN_W)) if (!(*map->pfn & NVKM_VMM_PFN_W))
data |= BIT_ULL(6); /* RO. */ data |= BIT_ULL(6); /* RO. */
...@@ -100,7 +104,6 @@ gp100_vmm_pgt_pfn(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt, ...@@ -100,7 +104,6 @@ gp100_vmm_pgt_pfn(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt,
} }
VMM_WO064(pt, vmm, ptei++ * 8, data); VMM_WO064(pt, vmm, ptei++ * 8, data);
map->pfn++;
} }
nvkm_done(pt->memory); nvkm_done(pt->memory);
} }
...@@ -258,12 +261,96 @@ gp100_vmm_pd0_unmap(struct nvkm_vmm *vmm, ...@@ -258,12 +261,96 @@ gp100_vmm_pd0_unmap(struct nvkm_vmm *vmm,
VMM_FO128(pt, vmm, pdei * 0x10, 0ULL, 0ULL, pdes); VMM_FO128(pt, vmm, pdei * 0x10, 0ULL, 0ULL, pdes);
} }
/*
 * Release the DMA mappings held by a run of PD0-level entries.
 *
 * @vmm:  VMM whose device (vmm->mmu->subdev.device->dev) owns the mappings.
 * @pt:   page table memory holding the entries.
 * @ptei: index of the first entry to process.
 * @ptes: number of consecutive entries to process.
 *
 * Entries are spaced 16 bytes apart; only the low 64 bits of each entry
 * (two 32-bit reads at offsets 0 and 4) are examined.
 */
static void
gp100_vmm_pd0_pfn_unmap(struct nvkm_vmm *vmm,
struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes)
{
struct device *dev = vmm->mmu->subdev.device->dev;
dma_addr_t addr;
nvkm_kmap(pt->memory);
while (ptes--) {
u32 datalo = nvkm_ro32(pt->memory, pt->base + ptei * 16 + 0);
u32 datahi = nvkm_ro32(pt->memory, pt->base + ptei * 16 + 4);
u64 data = (u64)datahi << 32 | datalo;
/*
 * Bits 2:1 non-zero: gp100_vmm_pd0_pfn() sets this field to 2
 * (SYSTEM_COHERENT_MEMORY) for DMA-mapped pages, so zero presumably
 * means VRAM with nothing to unmap -- TODO confirm against the PTE
 * format.
 */
if ((data & (3ULL << 1)) != 0) {
/* Inverse of the "addr >> 4" encoding used when the entry was written. */
addr = (data >> 8) << 12;
/* Mapping size is 1UL << 21 (2MB), matching the dma_map_page() call. */
dma_unmap_page(dev, addr, 1UL << 21, DMA_BIDIRECTIONAL);
}
ptei++;
}
nvkm_done(pt->memory);
}
/*
 * Clear the VALID bit (bit 0) of PD0-level entries that are currently valid
 * and have a non-zero value in bits 2:1.
 *
 * @vmm:  VMM passed through to the VMM_WO064() write helper.
 * @pt:   page table memory holding the entries.
 * @ptei: index of the first entry to process.
 * @ptes: number of consecutive entries to process.
 *
 * Returns true if at least one entry was rewritten. The flag is named "dma",
 * so it presumably tells the caller that DMA mappings remain to be released
 * (see gp100_vmm_pd0_pfn_unmap()) -- TODO confirm against the caller.
 */
static bool
gp100_vmm_pd0_pfn_clear(struct nvkm_vmm *vmm,
struct nvkm_mmu_pt *pt, u32 ptei, u32 ptes)
{
bool dma = false;
nvkm_kmap(pt->memory);
while (ptes--) {
/* Entries are 16 bytes apart; only the low 64 bits are read. */
u32 datalo = nvkm_ro32(pt->memory, pt->base + ptei * 16 + 0);
u32 datahi = nvkm_ro32(pt->memory, pt->base + ptei * 16 + 4);
u64 data = (u64)datahi << 32 | datalo;
/* Entries with bit 0 clear or bits 2:1 zero are left untouched. */
if ((data & BIT_ULL(0)) && (data & (3ULL << 1)) != 0) {
/* Write the entry back unchanged except for bit 0. */
VMM_WO064(pt, vmm, ptei * 16, data & ~BIT_ULL(0));
dma = true;
}
ptei++;
}
nvkm_done(pt->memory);
return dma;
}
/*
 * Write PD0-level entries from an array of NVKM_VMM_PFN_* encoded values.
 *
 * @vmm:  VMM whose device is used for DMA mapping.
 * @pt:   page table memory receiving the entries (16 bytes per entry).
 * @ptei: index of the first entry to write.
 * @ptes: number of input values to consume from map->pfn.
 * @map:  mapping state; map->pfn is advanced once per consumed input value.
 *
 * Host (non-VRAM) pages are DMA-mapped with a size of 1UL << 21 (2MB) before
 * being encoded into the entry.
 */
static void
gp100_vmm_pd0_pfn(struct nvkm_vmm *vmm, struct nvkm_mmu_pt *pt,
u32 ptei, u32 ptes, struct nvkm_vmm_map *map)
{
struct device *dev = vmm->mmu->subdev.device->dev;
dma_addr_t addr;
nvkm_kmap(pt->memory);
for (; ptes; ptes--, map->pfn++) {
u64 data = 0;
/*
 * Inputs without the V flag are skipped and nothing is written.
 * NOTE(review): ptei is only advanced by the VMM_WO064() call below,
 * so a skipped input does not advance the output index -- confirm the
 * caller never mixes valid and invalid inputs in one call.
 */
if (!(*map->pfn & NVKM_VMM_PFN_V))
continue;
if (!(*map->pfn & NVKM_VMM_PFN_W))
data |= BIT_ULL(6); /* RO. */
if (!(*map->pfn & NVKM_VMM_PFN_VRAM)) {
/* Host memory: the address field is used as a page frame number. */
addr = *map->pfn >> NVKM_VMM_PFN_ADDR_SHIFT;
addr = dma_map_page(dev, pfn_to_page(addr), 0,
1UL << 21, DMA_BIDIRECTIONAL);
/*
 * On a mapping error a warning is raised and the entry is still
 * written below, but without the address or VALID bit.
 */
if (!WARN_ON(dma_mapping_error(dev, addr))) {
data |= addr >> 4;
data |= 2ULL << 1; /* SYSTEM_COHERENT_MEMORY. */
data |= BIT_ULL(3); /* VOL. */
data |= BIT_ULL(0); /* VALID. */
}
} else {
/* VRAM: the address bits are taken directly from the input value. */
data |= (*map->pfn & NVKM_VMM_PFN_ADDR) >> 4;
data |= BIT_ULL(0); /* VALID. */
}
VMM_WO064(pt, vmm, ptei++ * 16, data);
}
nvkm_done(pt->memory);
}
static const struct nvkm_vmm_desc_func static const struct nvkm_vmm_desc_func
gp100_vmm_desc_pd0 = { gp100_vmm_desc_pd0 = {
.unmap = gp100_vmm_pd0_unmap, .unmap = gp100_vmm_pd0_unmap,
.sparse = gp100_vmm_pd0_sparse, .sparse = gp100_vmm_pd0_sparse,
.pde = gp100_vmm_pd0_pde, .pde = gp100_vmm_pd0_pde,
.mem = gp100_vmm_pd0_mem, .mem = gp100_vmm_pd0_mem,
.pfn = gp100_vmm_pd0_pfn,
.pfn_clear = gp100_vmm_pd0_pfn_clear,
.pfn_unmap = gp100_vmm_pd0_pfn_unmap,
}; };
static void static void
......
...@@ -37,16 +37,17 @@ ...@@ -37,16 +37,17 @@
* will fail. Must be combined with HMM_PFN_REQ_FAULT. * will fail. Must be combined with HMM_PFN_REQ_FAULT.
*/ */
enum hmm_pfn_flags { enum hmm_pfn_flags {
/* Output flags */ /* Output fields and flags */
HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1), HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1),
HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2), HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2),
HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3), HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3),
HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8),
/* Input flags */ /* Input flags */
HMM_PFN_REQ_FAULT = HMM_PFN_VALID, HMM_PFN_REQ_FAULT = HMM_PFN_VALID,
HMM_PFN_REQ_WRITE = HMM_PFN_WRITE, HMM_PFN_REQ_WRITE = HMM_PFN_WRITE,
HMM_PFN_FLAGS = HMM_PFN_VALID | HMM_PFN_WRITE | HMM_PFN_ERROR, HMM_PFN_FLAGS = 0xFFUL << HMM_PFN_ORDER_SHIFT,
}; };
/* /*
...@@ -61,6 +62,25 @@ static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn) ...@@ -61,6 +62,25 @@ static inline struct page *hmm_pfn_to_page(unsigned long hmm_pfn)
return pfn_to_page(hmm_pfn & ~HMM_PFN_FLAGS); return pfn_to_page(hmm_pfn & ~HMM_PFN_FLAGS);
} }
/*
 * hmm_pfn_to_map_order() - return the order of the CPU mapping for a pfn
 *
 * Optional helper for consumers that want to process the pfn result array
 * in larger units. The returned order means the CPU maps this pfn with a
 * mapping that begins at the order-aligned VA and covers 1 << order pages
 * (i.e. PAGE_SIZE << order bytes). Every pfn inside a high-order mapping
 * carries the same pfn flags, both the access protections and the map order.
 * Callers must handle edge cases: the start and end VA of the mapping may
 * lie outside the range that was passed to hmm_range_fault().
 *
 * Call this only while holding the caller's 'user_lock', after a successful
 * mmu_interval_read_begin(), and only on an entry that has already been
 * checked for HMM_PFN_VALID.
 */
static inline unsigned int hmm_pfn_to_map_order(unsigned long hmm_pfn)
{
	/* The order is a 5-bit field located at HMM_PFN_ORDER_SHIFT. */
	unsigned long order_bits = hmm_pfn >> HMM_PFN_ORDER_SHIFT;

	return order_bits & 0x1F;
}
/* /*
* struct hmm_range - track invalidation lock on virtual address range * struct hmm_range - track invalidation lock on virtual address range
* *
......
...@@ -180,6 +180,11 @@ static inline unsigned long migrate_pfn(unsigned long pfn) ...@@ -180,6 +180,11 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID; return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
} }
/*
 * Selector bits for the 'flags' field of struct migrate_vma. The values are
 * distinct bits. MIGRATE_VMA_SELECT_DEVICE_PRIVATE is the flag to set, along
 * with pgmap_owner, when migrating out of device private memory;
 * MIGRATE_VMA_SELECT_SYSTEM is presumably its counterpart for pages in
 * system memory -- confirm against migrate_vma_setup().
 */
enum migrate_vma_direction {
	MIGRATE_VMA_SELECT_SYSTEM		= 0x1,
	MIGRATE_VMA_SELECT_DEVICE_PRIVATE	= 0x2,
};
struct migrate_vma { struct migrate_vma {
struct vm_area_struct *vma; struct vm_area_struct *vma;
/* /*
...@@ -199,11 +204,14 @@ struct migrate_vma { ...@@ -199,11 +204,14 @@ struct migrate_vma {
/* /*
* Set to the owner value also stored in page->pgmap->owner for * Set to the owner value also stored in page->pgmap->owner for
* migrating out of device private memory. If set only device * migrating out of device private memory. The flags also need to
* private pages with this owner are migrated. If not set * be set to MIGRATE_VMA_SELECT_DEVICE_PRIVATE.
* device private pages are not migrated at all. * The caller should always set this field when using mmu notifier
* callbacks to avoid device MMU invalidations for device private
* pages that are not being migrated.
*/ */
void *src_owner; void *pgmap_owner;
unsigned long flags;
}; };
int migrate_vma_setup(struct migrate_vma *args); int migrate_vma_setup(struct migrate_vma *args);
......
...@@ -38,6 +38,10 @@ struct mmu_interval_notifier; ...@@ -38,6 +38,10 @@ struct mmu_interval_notifier;
* *
* @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
* that the mm refcount is zero and the range is no longer accessible. * that the mm refcount is zero and the range is no longer accessible.
*
* @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
* a device driver to possibly ignore the invalidation if the
* migrate_pgmap_owner field matches the driver's device private pgmap owner.
*/ */
enum mmu_notifier_event { enum mmu_notifier_event {
MMU_NOTIFY_UNMAP = 0, MMU_NOTIFY_UNMAP = 0,
...@@ -46,6 +50,7 @@ enum mmu_notifier_event { ...@@ -46,6 +50,7 @@ enum mmu_notifier_event {
MMU_NOTIFY_PROTECTION_PAGE, MMU_NOTIFY_PROTECTION_PAGE,
MMU_NOTIFY_SOFT_DIRTY, MMU_NOTIFY_SOFT_DIRTY,
MMU_NOTIFY_RELEASE, MMU_NOTIFY_RELEASE,
MMU_NOTIFY_MIGRATE,
}; };
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
...@@ -264,6 +269,7 @@ struct mmu_notifier_range { ...@@ -264,6 +269,7 @@ struct mmu_notifier_range {
unsigned long end; unsigned long end;
unsigned flags; unsigned flags;
enum mmu_notifier_event event; enum mmu_notifier_event event;
void *migrate_pgmap_owner;
}; };
static inline int mm_has_notifiers(struct mm_struct *mm) static inline int mm_has_notifiers(struct mm_struct *mm)
......
...@@ -214,6 +214,14 @@ static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni, ...@@ -214,6 +214,14 @@ static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni,
{ {
struct dmirror *dmirror = container_of(mni, struct dmirror, notifier); struct dmirror *dmirror = container_of(mni, struct dmirror, notifier);
/*
* Ignore invalidation callbacks for device private pages since
* the invalidation is handled as part of the migration process.
*/
if (range->event == MMU_NOTIFY_MIGRATE &&
range->migrate_pgmap_owner == dmirror->mdevice)
return true;
if (mmu_notifier_range_blockable(range)) if (mmu_notifier_range_blockable(range))
mutex_lock(&dmirror->mutex); mutex_lock(&dmirror->mutex);
else if (!mutex_trylock(&dmirror->mutex)) else if (!mutex_trylock(&dmirror->mutex))
...@@ -585,15 +593,6 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, ...@@ -585,15 +593,6 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args,
*/ */
spage = migrate_pfn_to_page(*src); spage = migrate_pfn_to_page(*src);
/*
* Don't migrate device private pages from our own driver or
* others. For our own we would do a device private memory copy
* not a migration and for others, we would need to fault the
* other device's page into system memory first.
*/
if (spage && is_zone_device_page(spage))
continue;
dpage = dmirror_devmem_alloc_page(mdevice); dpage = dmirror_devmem_alloc_page(mdevice);
if (!dpage) if (!dpage)
continue; continue;
...@@ -702,7 +701,8 @@ static int dmirror_migrate(struct dmirror *dmirror, ...@@ -702,7 +701,8 @@ static int dmirror_migrate(struct dmirror *dmirror,
args.dst = dst_pfns; args.dst = dst_pfns;
args.start = addr; args.start = addr;
args.end = next; args.end = next;
args.src_owner = NULL; args.pgmap_owner = dmirror->mdevice;
args.flags = MIGRATE_VMA_SELECT_SYSTEM;
ret = migrate_vma_setup(&args); ret = migrate_vma_setup(&args);
if (ret) if (ret)
goto out; goto out;
...@@ -766,6 +766,10 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range, ...@@ -766,6 +766,10 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range,
*perm |= HMM_DMIRROR_PROT_WRITE; *perm |= HMM_DMIRROR_PROT_WRITE;
else else
*perm |= HMM_DMIRROR_PROT_READ; *perm |= HMM_DMIRROR_PROT_READ;
if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PMD_SHIFT)
*perm |= HMM_DMIRROR_PROT_PMD;
else if (hmm_pfn_to_map_order(entry) + PAGE_SHIFT == PUD_SHIFT)
*perm |= HMM_DMIRROR_PROT_PUD;
} }
static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni, static bool dmirror_snapshot_invalidate(struct mmu_interval_notifier *mni,
...@@ -987,7 +991,7 @@ static void dmirror_devmem_free(struct page *page) ...@@ -987,7 +991,7 @@ static void dmirror_devmem_free(struct page *page)
} }
static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
struct dmirror_device *mdevice) struct dmirror *dmirror)
{ {
const unsigned long *src = args->src; const unsigned long *src = args->src;
unsigned long *dst = args->dst; unsigned long *dst = args->dst;
...@@ -1009,6 +1013,7 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, ...@@ -1009,6 +1013,7 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
continue; continue;
lock_page(dpage); lock_page(dpage);
xa_erase(&dmirror->pt, addr >> PAGE_SHIFT);
copy_highpage(dpage, spage); copy_highpage(dpage, spage);
*dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
if (*src & MIGRATE_PFN_WRITE) if (*src & MIGRATE_PFN_WRITE)
...@@ -1017,15 +1022,6 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, ...@@ -1017,15 +1022,6 @@ static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args,
return 0; return 0;
} }
/*
 * Device-side finalize step used by dmirror_devmem_fault(): while holding
 * dmirror->mutex, run dmirror_do_update() over the faulting range described
 * by args->start and args->end.
 */
static void dmirror_devmem_fault_finalize_and_map(struct migrate_vma *args,
struct dmirror *dmirror)
{
/* Invalidate the device's page table mapping. */
mutex_lock(&dmirror->mutex);
dmirror_do_update(dmirror, args->start, args->end);
mutex_unlock(&dmirror->mutex);
}
static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
{ {
struct migrate_vma args; struct migrate_vma args;
...@@ -1049,16 +1045,21 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) ...@@ -1049,16 +1045,21 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf)
args.end = args.start + PAGE_SIZE; args.end = args.start + PAGE_SIZE;
args.src = &src_pfns; args.src = &src_pfns;
args.dst = &dst_pfns; args.dst = &dst_pfns;
args.src_owner = dmirror->mdevice; args.pgmap_owner = dmirror->mdevice;
args.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
if (migrate_vma_setup(&args)) if (migrate_vma_setup(&args))
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror->mdevice); ret = dmirror_devmem_fault_alloc_and_copy(&args, dmirror);
if (ret) if (ret)
return ret; return ret;
migrate_vma_pages(&args); migrate_vma_pages(&args);
dmirror_devmem_fault_finalize_and_map(&args, dmirror); /*
* No device finalize step is needed since
* dmirror_devmem_fault_alloc_and_copy() will have already
* invalidated the device page table.
*/
migrate_vma_finalize(&args); migrate_vma_finalize(&args);
return 0; return 0;
} }
......
...@@ -40,6 +40,8 @@ struct hmm_dmirror_cmd { ...@@ -40,6 +40,8 @@ struct hmm_dmirror_cmd {
* HMM_DMIRROR_PROT_NONE: unpopulated PTE or PTE with no access * HMM_DMIRROR_PROT_NONE: unpopulated PTE or PTE with no access
* HMM_DMIRROR_PROT_READ: read-only PTE * HMM_DMIRROR_PROT_READ: read-only PTE
* HMM_DMIRROR_PROT_WRITE: read/write PTE * HMM_DMIRROR_PROT_WRITE: read/write PTE
* HMM_DMIRROR_PROT_PMD: PMD sized page is fully mapped by same permissions
* HMM_DMIRROR_PROT_PUD: PUD sized page is fully mapped by same permissions
* HMM_DMIRROR_PROT_ZERO: special read-only zero page * HMM_DMIRROR_PROT_ZERO: special read-only zero page
* HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL: Migrated device private page on the * HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL: Migrated device private page on the
* device the ioctl() is made * device the ioctl() is made
...@@ -51,6 +53,8 @@ enum { ...@@ -51,6 +53,8 @@ enum {
HMM_DMIRROR_PROT_NONE = 0x00, HMM_DMIRROR_PROT_NONE = 0x00,
HMM_DMIRROR_PROT_READ = 0x01, HMM_DMIRROR_PROT_READ = 0x01,
HMM_DMIRROR_PROT_WRITE = 0x02, HMM_DMIRROR_PROT_WRITE = 0x02,
HMM_DMIRROR_PROT_PMD = 0x04,
HMM_DMIRROR_PROT_PUD = 0x08,
HMM_DMIRROR_PROT_ZERO = 0x10, HMM_DMIRROR_PROT_ZERO = 0x10,
HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20, HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20,
HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30, HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
......
...@@ -165,12 +165,19 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, ...@@ -165,12 +165,19 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
return hmm_pfns_fill(addr, end, range, 0); return hmm_pfns_fill(addr, end, range, 0);
} }
/*
 * Encode a page table mapping order as hmm pfn flag bits by shifting it
 * left by HMM_PFN_ORDER_SHIFT, so the result can be OR-ed together with
 * the HMM_PFN_VALID / HMM_PFN_WRITE flags.
 */
static inline unsigned long hmm_pfn_flags_order(unsigned long order)
{
	const unsigned long order_bits = order << HMM_PFN_ORDER_SHIFT;

	return order_bits;
}
static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range, static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
pmd_t pmd) pmd_t pmd)
{ {
if (pmd_protnone(pmd)) if (pmd_protnone(pmd))
return 0; return 0;
return pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
HMM_PFN_VALID) |
hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
} }
#ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE
...@@ -389,7 +396,9 @@ static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range, ...@@ -389,7 +396,9 @@ static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
{ {
if (!pud_present(pud)) if (!pud_present(pud))
return 0; return 0;
return pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID; return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
HMM_PFN_VALID) |
hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
} }
static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
...@@ -474,7 +483,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, ...@@ -474,7 +483,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
i = (start - range->start) >> PAGE_SHIFT; i = (start - range->start) >> PAGE_SHIFT;
pfn_req_flags = range->hmm_pfns[i]; pfn_req_flags = range->hmm_pfns[i];
cpu_flags = pte_to_hmm_pfn_flags(range, entry); cpu_flags = pte_to_hmm_pfn_flags(range, entry) |
hmm_pfn_flags_order(huge_page_order(hstate_vma(vma)));
required_fault = required_fault =
hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags); hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault) { if (required_fault) {
......
...@@ -2276,7 +2276,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, ...@@ -2276,7 +2276,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
goto next; goto next;
page = device_private_entry_to_page(entry); page = device_private_entry_to_page(entry);
if (page->pgmap->owner != migrate->src_owner) if (!(migrate->flags &
MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
page->pgmap->owner != migrate->pgmap_owner)
goto next; goto next;
mpfn = migrate_pfn(page_to_pfn(page)) | mpfn = migrate_pfn(page_to_pfn(page)) |
...@@ -2284,7 +2286,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, ...@@ -2284,7 +2286,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
if (is_write_device_private_entry(entry)) if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE; mpfn |= MIGRATE_PFN_WRITE;
} else { } else {
if (migrate->src_owner) if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
goto next; goto next;
pfn = pte_pfn(pte); pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) { if (is_zero_pfn(pfn)) {
...@@ -2379,8 +2381,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate) ...@@ -2379,8 +2381,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
{ {
struct mmu_notifier_range range; struct mmu_notifier_range range;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, /*
* Note that the pgmap_owner is passed to the mmu notifier callback so
* that the registered device driver can skip invalidating device
* private page mappings that won't be migrated.
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_MIGRATE, 0, migrate->vma,
migrate->vma->vm_mm, migrate->start, migrate->end); migrate->vma->vm_mm, migrate->start, migrate->end);
range.migrate_pgmap_owner = migrate->pgmap_owner;
mmu_notifier_invalidate_range_start(&range); mmu_notifier_invalidate_range_start(&range);
walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end, walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
......
...@@ -881,8 +881,9 @@ TEST_F(hmm, migrate) ...@@ -881,8 +881,9 @@ TEST_F(hmm, migrate)
} }
/* /*
* Migrate anonymous memory to device private memory and fault it back to system * Migrate anonymous memory to device private memory and fault some of it back
* memory. * to system memory, then try migrating the resulting mix of system and device
* private memory to the device.
*/ */
TEST_F(hmm, migrate_fault) TEST_F(hmm, migrate_fault)
{ {
...@@ -924,8 +925,17 @@ TEST_F(hmm, migrate_fault) ...@@ -924,8 +925,17 @@ TEST_F(hmm, migrate_fault)
for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
ASSERT_EQ(ptr[i], i); ASSERT_EQ(ptr[i], i);
/* Fault pages back to system memory and check them. */ /* Fault half the pages back to system memory and check them. */
for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i)
ASSERT_EQ(ptr[i], i);
/* Migrate memory to the device again. */
ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
/* Check what the device read. */
for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
ASSERT_EQ(ptr[i], i); ASSERT_EQ(ptr[i], i);
hmm_buffer_free(buffer); hmm_buffer_free(buffer);
...@@ -1291,6 +1301,82 @@ TEST_F(hmm2, snapshot) ...@@ -1291,6 +1301,82 @@ TEST_F(hmm2, snapshot)
hmm_buffer_free(buffer); hmm_buffer_free(buffer);
} }
/*
 * Test that huge page mappings are reported to the device together with
 * their mapping size: every page of a snapshotted hugetlbfs region is
 * expected to carry HMM_DMIRROR_PROT_PMD in addition to its access
 * permission, first while the region is writable and again after it has
 * been made read-only.
 *
 * The test returns early when no huge page size is reported or when the
 * huge page region cannot be allocated.
 */
TEST_F(hmm, compound)
{
struct hmm_buffer *buffer;
unsigned long npages;
unsigned long size;
int *ptr;
unsigned char *m;
int ret;
long pagesizes[4];
int n, idx;
unsigned long i;
/* Skip test if we can't allocate a hugetlbfs page. */
n = gethugepagesizes(pagesizes, 4);
if (n <= 0)
return;
/* Pick the index of the smallest huge page size that was reported. */
for (idx = 0; --n > 0; ) {
if (pagesizes[n] < pagesizes[idx])
idx = n;
}
/* Size the buffer as TWOMEG aligned to the chosen huge page size. */
size = ALIGN(TWOMEG, pagesizes[idx]);
npages = size >> self->page_shift;
buffer = malloc(sizeof(*buffer));
ASSERT_NE(buffer, NULL);
buffer->ptr = get_hugepage_region(size, GHR_STRICT);
if (buffer->ptr == NULL) {
free(buffer);
return;
}
buffer->size = size;
/* One byte per page: the snapshot is read back below as unsigned char. */
buffer->mirror = malloc(npages);
ASSERT_NE(buffer->mirror, NULL);
/* Initialize the pages the device will snapshot in buffer->ptr. */
for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
ptr[i] = i;
/* Simulate a device snapshotting CPU pagetables. */
ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
/* Check what the device saw. */
m = buffer->mirror;
for (i = 0; i < npages; ++i)
ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE |
HMM_DMIRROR_PROT_PMD);
/* Make the region read-only. */
ret = mprotect(buffer->ptr, size, PROT_READ);
ASSERT_EQ(ret, 0);
/* Simulate a device snapshotting CPU pagetables. */
ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
/* Check what the device saw. */
m = buffer->mirror;
for (i = 0; i < npages; ++i)
ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ |
HMM_DMIRROR_PROT_PMD);
/*
 * The region came from get_hugepage_region(), so release it with
 * free_hugepage_region() and clear ptr before hmm_buffer_free()
 * (presumably so the latter does not try to release it again; confirm
 * against hmm_buffer_free()).
 */
free_hugepage_region(buffer->ptr);
buffer->ptr = NULL;
hmm_buffer_free(buffer);
}
/* /*
* Test two devices reading the same memory (double mapped). * Test two devices reading the same memory (double mapped).
*/ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment