Commit fec88ab0 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull HMM updates from Jason Gunthorpe:
 "Improvements and bug fixes for the hmm interface in the kernel:

   - Improve clarity, locking and APIs related to the 'hmm mirror'
     feature merged last cycle. In linux-next we now see AMDGPU and
     nouveau to be using this API.

   - Remove old or transitional hmm APIs. These are hold overs from the
     past with no users, or APIs that existed only to manage cross tree
     conflicts. There are still a few more of these cleanups that didn't
     make the merge window cut off.

   - Improve some core mm APIs:
       - export alloc_pages_vma() for driver use
       - refactor into devm_request_free_mem_region() to manage
         DEVICE_PRIVATE resource reservations
       - refactor duplicative driver code into the core dev_pagemap
         struct

   - Remove hmm wrappers of improved core mm APIs, instead have drivers
     use the simplified API directly

   - Remove DEVICE_PUBLIC

   - Simplify the kconfig flow for the hmm users and core code"

* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (42 commits)
  mm: don't select MIGRATE_VMA_HELPER from HMM_MIRROR
  mm: remove the HMM config option
  mm: sort out the DEVICE_PRIVATE Kconfig mess
  mm: simplify ZONE_DEVICE page private data
  mm: remove hmm_devmem_add
  mm: remove hmm_vma_alloc_locked_page
  nouveau: use devm_memremap_pages directly
  nouveau: use alloc_page_vma directly
  PCI/P2PDMA: use the dev_pagemap internal refcount
  device-dax: use the dev_pagemap internal refcount
  memremap: provide an optional internal refcount in struct dev_pagemap
  memremap: replace the altmap_valid field with a PGMAP_ALTMAP_VALID flag
  memremap: remove the data field in struct dev_pagemap
  memremap: add a migrate_to_ram method to struct dev_pagemap_ops
  memremap: lift the devmap_enable manipulation into devm_memremap_pages
  memremap: pass a struct dev_pagemap to ->kill and ->cleanup
  memremap: move dev_pagemap callbacks into a separate structure
  memremap: validate the pagemap type passed to devm_memremap_pages
  mm: factor out a devm_request_free_mem_region helper
  mm: export alloc_pages_vma
  ...
parents fa6e951a cc5dfd59
This diff is collapsed.
......@@ -131,17 +131,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct page *page;
struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
int ret;
/*
* If we have an altmap then we need to skip over any reserved PFNs
* when querying the zone.
*/
page = pfn_to_page(start_pfn);
if (altmap)
page += vmem_altmap_offset(altmap);
__remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
/* Remove htab bolted mappings for this section of memory */
......
......@@ -1213,13 +1213,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct page *page = pfn_to_page(start_pfn);
struct zone *zone;
struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
struct zone *zone = page_zone(page);
/* With altmap the first mapped page is offset from @start */
if (altmap)
page += vmem_altmap_offset(altmap);
zone = page_zone(page);
__remove_pages(zone, start_pfn, nr_pages, altmap);
kernel_physical_mapping_remove(start, start + size);
}
......
......@@ -43,8 +43,6 @@ struct dax_region {
* @target_node: effective numa node if dev_dax memory range is onlined
* @dev - device core
* @pgmap - pgmap for memmap setup / lifetime (driver owned)
* @ref: pgmap reference count (driver owned)
* @cmp: @ref final put completion (driver owned)
*/
struct dev_dax {
struct dax_region *region;
......@@ -52,8 +50,6 @@ struct dev_dax {
int target_node;
struct device dev;
struct dev_pagemap pgmap;
struct percpu_ref ref;
struct completion cmp;
};
static inline struct dev_dax *to_dev_dax(struct device *dev)
......
......@@ -14,37 +14,6 @@
#include "dax-private.h"
#include "bus.h"
static struct dev_dax *ref_to_dev_dax(struct percpu_ref *ref)
{
return container_of(ref, struct dev_dax, ref);
}
static void dev_dax_percpu_release(struct percpu_ref *ref)
{
struct dev_dax *dev_dax = ref_to_dev_dax(ref);
dev_dbg(&dev_dax->dev, "%s\n", __func__);
complete(&dev_dax->cmp);
}
static void dev_dax_percpu_exit(struct percpu_ref *ref)
{
struct dev_dax *dev_dax = ref_to_dev_dax(ref);
dev_dbg(&dev_dax->dev, "%s\n", __func__);
wait_for_completion(&dev_dax->cmp);
percpu_ref_exit(ref);
}
static void dev_dax_percpu_kill(struct percpu_ref *data)
{
struct percpu_ref *ref = data;
struct dev_dax *dev_dax = ref_to_dev_dax(ref);
dev_dbg(&dev_dax->dev, "%s\n", __func__);
percpu_ref_kill(ref);
}
static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
const char *func)
{
......@@ -459,15 +428,7 @@ int dev_dax_probe(struct device *dev)
return -EBUSY;
}
init_completion(&dev_dax->cmp);
rc = percpu_ref_init(&dev_dax->ref, dev_dax_percpu_release, 0,
GFP_KERNEL);
if (rc)
return rc;
dev_dax->pgmap.ref = &dev_dax->ref;
dev_dax->pgmap.kill = dev_dax_percpu_kill;
dev_dax->pgmap.cleanup = dev_dax_percpu_exit;
dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX;
addr = devm_memremap_pages(dev, &dev_dax->pgmap);
if (IS_ERR(addr))
return PTR_ERR(addr);
......
......@@ -16,7 +16,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
struct dev_dax *dev_dax;
struct nd_namespace_io *nsio;
struct dax_region *dax_region;
struct dev_pagemap pgmap = { 0 };
struct dev_pagemap pgmap = { };
struct nd_namespace_common *ndns;
struct nd_dax *nd_dax = to_nd_dax(dev);
struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
......
......@@ -84,11 +84,11 @@ config DRM_NOUVEAU_BACKLIGHT
config DRM_NOUVEAU_SVM
bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support"
depends on ARCH_HAS_HMM
depends on DEVICE_PRIVATE
depends on DRM_NOUVEAU
depends on HMM_MIRROR
depends on STAGING
select HMM_MIRROR
select DEVICE_PRIVATE
select MIGRATE_VMA_HELPER
default n
help
Say Y here if you want to enable experimental support for
......
......@@ -72,7 +72,8 @@ struct nouveau_dmem_migrate {
};
struct nouveau_dmem {
struct hmm_devmem *devmem;
struct nouveau_drm *drm;
struct dev_pagemap pagemap;
struct nouveau_dmem_migrate migrate;
struct list_head chunk_free;
struct list_head chunk_full;
......@@ -80,6 +81,11 @@ struct nouveau_dmem {
struct mutex mutex;
};
static inline struct nouveau_dmem *page_to_dmem(struct page *page)
{
return container_of(page->pgmap, struct nouveau_dmem, pagemap);
}
struct nouveau_dmem_fault {
struct nouveau_drm *drm;
struct nouveau_fence *fence;
......@@ -96,14 +102,10 @@ struct nouveau_migrate {
unsigned long dma_nr;
};
static void
nouveau_dmem_free(struct hmm_devmem *devmem, struct page *page)
static void nouveau_dmem_page_free(struct page *page)
{
struct nouveau_dmem_chunk *chunk;
unsigned long idx;
chunk = (void *)hmm_devmem_page_get_drvdata(page);
idx = page_to_pfn(page) - chunk->pfn_first;
struct nouveau_dmem_chunk *chunk = page->zone_device_data;
unsigned long idx = page_to_pfn(page) - chunk->pfn_first;
/*
* FIXME:
......@@ -148,11 +150,12 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
continue;
dpage = hmm_vma_alloc_locked_page(vma, addr);
dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr);
if (!dpage) {
dst_pfns[i] = MIGRATE_PFN_ERROR;
continue;
}
lock_page(dpage);
dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) |
MIGRATE_PFN_LOCKED;
......@@ -194,7 +197,7 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
dst_addr = fault->dma[fault->npages++];
chunk = (void *)hmm_devmem_page_get_drvdata(spage);
chunk = spage->zone_device_data;
src_addr = page_to_pfn(spage) - chunk->pfn_first;
src_addr = (src_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
......@@ -259,29 +262,21 @@ static const struct migrate_vma_ops nouveau_dmem_fault_migrate_ops = {
.finalize_and_map = nouveau_dmem_fault_finalize_and_map,
};
static vm_fault_t
nouveau_dmem_fault(struct hmm_devmem *devmem,
struct vm_area_struct *vma,
unsigned long addr,
const struct page *page,
unsigned int flags,
pmd_t *pmdp)
static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
{
struct drm_device *drm_dev = dev_get_drvdata(devmem->device);
struct nouveau_dmem *dmem = page_to_dmem(vmf->page);
unsigned long src[1] = {0}, dst[1] = {0};
struct nouveau_dmem_fault fault = {0};
struct nouveau_dmem_fault fault = { .drm = dmem->drm };
int ret;
/*
* FIXME what we really want is to find some heuristic to migrate more
* than just one page on CPU fault. When such fault happens it is very
* likely that more surrounding page will CPU fault too.
*/
fault.drm = nouveau_drm(drm_dev);
ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vma, addr,
addr + PAGE_SIZE, src, dst, &fault);
ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vmf->vma,
vmf->address, vmf->address + PAGE_SIZE,
src, dst, &fault);
if (ret)
return VM_FAULT_SIGBUS;
......@@ -291,10 +286,9 @@ nouveau_dmem_fault(struct hmm_devmem *devmem,
return 0;
}
static const struct hmm_devmem_ops
nouveau_dmem_devmem_ops = {
.free = nouveau_dmem_free,
.fault = nouveau_dmem_fault,
static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = {
.page_free = nouveau_dmem_page_free,
.migrate_to_ram = nouveau_dmem_migrate_to_ram,
};
static int
......@@ -580,7 +574,8 @@ void
nouveau_dmem_init(struct nouveau_drm *drm)
{
struct device *device = drm->dev->dev;
unsigned long i, size;
struct resource *res;
unsigned long i, size, pfn_first;
int ret;
/* This only make sense on PASCAL or newer */
......@@ -590,6 +585,7 @@ nouveau_dmem_init(struct nouveau_drm *drm)
if (!(drm->dmem = kzalloc(sizeof(*drm->dmem), GFP_KERNEL)))
return;
drm->dmem->drm = drm;
mutex_init(&drm->dmem->mutex);
INIT_LIST_HEAD(&drm->dmem->chunk_free);
INIT_LIST_HEAD(&drm->dmem->chunk_full);
......@@ -599,11 +595,8 @@ nouveau_dmem_init(struct nouveau_drm *drm)
/* Initialize migration dma helpers before registering memory */
ret = nouveau_dmem_migrate_init(drm);
if (ret) {
kfree(drm->dmem);
drm->dmem = NULL;
return;
}
if (ret)
goto out_free;
/*
* FIXME we need some kind of policy to decide how much VRAM we
......@@ -611,14 +604,16 @@ nouveau_dmem_init(struct nouveau_drm *drm)
* and latter if we want to do thing like over commit then we
* could revisit this.
*/
drm->dmem->devmem = hmm_devmem_add(&nouveau_dmem_devmem_ops,
device, size);
if (IS_ERR(drm->dmem->devmem)) {
kfree(drm->dmem);
drm->dmem = NULL;
return;
}
res = devm_request_free_mem_region(device, &iomem_resource, size);
if (IS_ERR(res))
goto out_free;
drm->dmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
drm->dmem->pagemap.res = *res;
drm->dmem->pagemap.ops = &nouveau_dmem_pagemap_ops;
if (IS_ERR(devm_memremap_pages(device, &drm->dmem->pagemap)))
goto out_free;
pfn_first = res->start >> PAGE_SHIFT;
for (i = 0; i < (size / DMEM_CHUNK_SIZE); ++i) {
struct nouveau_dmem_chunk *chunk;
struct page *page;
......@@ -631,17 +626,19 @@ nouveau_dmem_init(struct nouveau_drm *drm)
}
chunk->drm = drm;
chunk->pfn_first = drm->dmem->devmem->pfn_first;
chunk->pfn_first += (i * DMEM_CHUNK_NPAGES);
chunk->pfn_first = pfn_first + (i * DMEM_CHUNK_NPAGES);
list_add_tail(&chunk->list, &drm->dmem->chunk_empty);
page = pfn_to_page(chunk->pfn_first);
for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page) {
hmm_devmem_page_set_drvdata(page, (long)chunk);
}
for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page)
page->zone_device_data = chunk;
}
NV_INFO(drm, "DMEM: registered %ldMB of device memory\n", size >> 20);
return;
out_free:
kfree(drm->dmem);
drm->dmem = NULL;
}
static void
......@@ -697,7 +694,7 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma,
if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
continue;
chunk = (void *)hmm_devmem_page_get_drvdata(dpage);
chunk = dpage->zone_device_data;
dst_addr = page_to_pfn(dpage) - chunk->pfn_first;
dst_addr = (dst_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
......@@ -832,13 +829,7 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
static inline bool
nouveau_dmem_page(struct nouveau_drm *drm, struct page *page)
{
if (!is_device_private_page(page))
return false;
if (drm->dmem->devmem != page->pgmap->data)
return false;
return true;
return is_device_private_page(page) && drm->dmem == page_to_dmem(page);
}
void
......@@ -867,7 +858,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
continue;
}
chunk = (void *)hmm_devmem_page_get_drvdata(page);
chunk = page->zone_device_data;
addr = page_to_pfn(page) - chunk->pfn_first;
addr = (addr + chunk->bo->bo.mem.start) << PAGE_SHIFT;
......
......@@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
range.values = nouveau_svm_pfn_values;
range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
again:
ret = hmm_vma_fault(&range, true);
ret = hmm_vma_fault(&svmm->mirror, &range, true);
if (ret == 0) {
mutex_lock(&svmm->mutex);
if (!hmm_vma_range_done(&range)) {
......
......@@ -622,7 +622,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
if (offset < reserve)
return -EINVAL;
nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
pgmap->altmap_valid = false;
} else if (nd_pfn->mode == PFN_MODE_PMEM) {
nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res)
- offset) / PAGE_SIZE);
......@@ -634,7 +633,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
memcpy(altmap, &__altmap, sizeof(*altmap));
altmap->free = PHYS_PFN(offset - reserve);
altmap->alloc = 0;
pgmap->altmap_valid = true;
pgmap->flags |= PGMAP_ALTMAP_VALID;
} else
return -ENXIO;
......
......@@ -303,24 +303,24 @@ static const struct attribute_group *pmem_attribute_groups[] = {
NULL,
};
static void __pmem_release_queue(struct percpu_ref *ref)
static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
{
struct request_queue *q;
struct request_queue *q =
container_of(pgmap->ref, struct request_queue, q_usage_counter);
q = container_of(ref, typeof(*q), q_usage_counter);
blk_cleanup_queue(q);
}
static void pmem_release_queue(void *ref)
static void pmem_release_queue(void *pgmap)
{
__pmem_release_queue(ref);
pmem_pagemap_cleanup(pgmap);
}
static void pmem_freeze_queue(struct percpu_ref *ref)
static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
{
struct request_queue *q;
struct request_queue *q =
container_of(pgmap->ref, struct request_queue, q_usage_counter);
q = container_of(ref, typeof(*q), q_usage_counter);
blk_freeze_queue_start(q);
}
......@@ -334,26 +334,16 @@ static void pmem_release_disk(void *__pmem)
put_disk(pmem->disk);
}
static void pmem_release_pgmap_ops(void *__pgmap)
{
dev_pagemap_put_ops();
}
static void fsdax_pagefree(struct page *page, void *data)
static void pmem_pagemap_page_free(struct page *page)
{
wake_up_var(&page->_refcount);
}
static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
{
dev_pagemap_get_ops();
if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
return -ENOMEM;
pgmap->type = MEMORY_DEVICE_FS_DAX;
pgmap->page_free = fsdax_pagefree;
return 0;
}
static const struct dev_pagemap_ops fsdax_pagemap_ops = {
.page_free = pmem_pagemap_page_free,
.kill = pmem_pagemap_kill,
.cleanup = pmem_pagemap_cleanup,
};
static int pmem_attach_disk(struct device *dev,
struct nd_namespace_common *ndns)
......@@ -409,11 +399,9 @@ static int pmem_attach_disk(struct device *dev,
pmem->pfn_flags = PFN_DEV;
pmem->pgmap.ref = &q->q_usage_counter;
pmem->pgmap.kill = pmem_freeze_queue;
pmem->pgmap.cleanup = __pmem_release_queue;
if (is_nd_pfn(dev)) {
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
return -ENOMEM;
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pfn_sb = nd_pfn->pfn_sb;
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
......@@ -424,15 +412,14 @@ static int pmem_attach_disk(struct device *dev,
bb_res.start += pmem->data_offset;
} else if (pmem_should_map_pages(dev)) {
memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
pmem->pgmap.altmap_valid = false;
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
return -ENOMEM;
pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
pmem->pgmap.ops = &fsdax_pagemap_ops;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pmem->pfn_flags |= PFN_MAP;
memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
} else {
if (devm_add_action_or_reset(dev, pmem_release_queue,
&q->q_usage_counter))
&pmem->pgmap))
return -ENOMEM;
addr = devm_memremap(dev, pmem->phys_addr,
pmem->size, ARCH_MEMREMAP_PMEM);
......
......@@ -25,12 +25,6 @@ struct pci_p2pdma {
bool p2pmem_published;
};
struct p2pdma_pagemap {
struct dev_pagemap pgmap;
struct percpu_ref ref;
struct completion ref_done;
};
static ssize_t size_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
......@@ -79,31 +73,6 @@ static const struct attribute_group p2pmem_group = {
.name = "p2pmem",
};
static struct p2pdma_pagemap *to_p2p_pgmap(struct percpu_ref *ref)
{
return container_of(ref, struct p2pdma_pagemap, ref);
}
static void pci_p2pdma_percpu_release(struct percpu_ref *ref)
{
struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
complete(&p2p_pgmap->ref_done);
}
static void pci_p2pdma_percpu_kill(struct percpu_ref *ref)
{
percpu_ref_kill(ref);
}
static void pci_p2pdma_percpu_cleanup(struct percpu_ref *ref)
{
struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
wait_for_completion(&p2p_pgmap->ref_done);
percpu_ref_exit(&p2p_pgmap->ref);
}
static void pci_p2pdma_release(void *data)
{
struct pci_dev *pdev = data;
......@@ -166,7 +135,6 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
u64 offset)
{
struct p2pdma_pagemap *p2p_pgmap;
struct dev_pagemap *pgmap;
void *addr;
int error;
......@@ -189,27 +157,15 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
return error;
}
p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL);
if (!p2p_pgmap)
pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL);
if (!pgmap)
return -ENOMEM;
init_completion(&p2p_pgmap->ref_done);
error = percpu_ref_init(&p2p_pgmap->ref,
pci_p2pdma_percpu_release, 0, GFP_KERNEL);
if (error)
goto pgmap_free;
pgmap = &p2p_pgmap->pgmap;
pgmap->res.start = pci_resource_start(pdev, bar) + offset;
pgmap->res.end = pgmap->res.start + size - 1;
pgmap->res.flags = pci_resource_flags(pdev, bar);
pgmap->ref = &p2p_pgmap->ref;
pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
pci_resource_start(pdev, bar);
pgmap->kill = pci_p2pdma_percpu_kill;
pgmap->cleanup = pci_p2pdma_percpu_cleanup;
addr = devm_memremap_pages(&pdev->dev, pgmap);
if (IS_ERR(addr)) {
......@@ -220,7 +176,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr,
pci_bus_address(pdev, bar) + offset,
resource_size(&pgmap->res), dev_to_node(&pdev->dev),
&p2p_pgmap->ref);
pgmap->ref);
if (error)
goto pages_free;
......@@ -232,7 +188,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
pages_free:
devm_memunmap_pages(&pdev->dev, pgmap);
pgmap_free:
devm_kfree(&pdev->dev, p2p_pgmap);
devm_kfree(&pdev->dev, pgmap);
return error;
}
EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource);
......
......@@ -1322,7 +1322,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
if (pm->show_pfn)
frame = pte_pfn(pte);
flags |= PM_PRESENT;
page = _vm_normal_page(vma, addr, pte, true);
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
......
This diff is collapsed.
......@@ -133,8 +133,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY = 4,
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
IORES_DESC_RESERVED = 8,
IORES_DESC_RESERVED = 7,
};
/*
......@@ -296,6 +295,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
return (r1->start <= r2->end && r1->end >= r2->start);
}
struct resource *devm_request_free_mem_region(struct device *dev,
struct resource *base, unsigned long size);
#endif /* __ASSEMBLY__ */
#endif /* _LINUX_IOPORT_H */
......@@ -37,13 +37,6 @@ struct vmem_altmap {
* A more complete discussion of unaddressable memory may be found in
* include/linux/hmm.h and Documentation/vm/hmm.rst.
*
* MEMORY_DEVICE_PUBLIC:
* Device memory that is cache coherent from device and CPU point of view. This
* is use on platform that have an advance system bus (like CAPI or CCIX). A
* driver can hotplug the device memory using ZONE_DEVICE and with that memory
* type. Any page of a process can be migrated to such memory. However no one
* should be allow to pin such memory so that it can always be evicted.
*
* MEMORY_DEVICE_FS_DAX:
* Host memory that has similar access semantics as System RAM i.e. DMA
* coherent and supports page pinning. In support of coordinating page
......@@ -52,54 +45,84 @@ struct vmem_altmap {
* wakeup is used to coordinate physical address space management (ex:
* fs truncate/hole punch) vs pinned pages (ex: device dma).
*
* MEMORY_DEVICE_DEVDAX:
* Host memory that has similar access semantics as System RAM i.e. DMA
* coherent and supports page pinning. In contrast to
* MEMORY_DEVICE_FS_DAX, this memory is access via a device-dax
* character device.
*
* MEMORY_DEVICE_PCI_P2PDMA:
* Device memory residing in a PCI BAR intended for use with Peer-to-Peer
* transactions.
*/
enum memory_type {
/* 0 is reserved to catch uninitialized type fields */
MEMORY_DEVICE_PRIVATE = 1,
MEMORY_DEVICE_PUBLIC,
MEMORY_DEVICE_FS_DAX,
MEMORY_DEVICE_DEVDAX,
MEMORY_DEVICE_PCI_P2PDMA,
};
/*
* Additional notes about MEMORY_DEVICE_PRIVATE may be found in
* include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief
* explanation in include/linux/memory_hotplug.h.
*
* The page_free() callback is called once the page refcount reaches 1
* (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
* This allows the device driver to implement its own memory management.)
*/
typedef void (*dev_page_free_t)(struct page *page, void *data);
struct dev_pagemap_ops {
/*
* Called once the page refcount reaches 1. (ZONE_DEVICE pages never
* reach 0 refcount unless there is a refcount bug. This allows the
* device driver to implement its own memory management.)
*/
void (*page_free)(struct page *page);
/*
* Transition the refcount in struct dev_pagemap to the dead state.
*/
void (*kill)(struct dev_pagemap *pgmap);
/*
* Wait for refcount in struct dev_pagemap to be idle and reap it.
*/
void (*cleanup)(struct dev_pagemap *pgmap);
/*
* Used for private (un-addressable) device memory only. Must migrate
* the page back to a CPU accessible page.
*/
vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
};
#define PGMAP_ALTMAP_VALID (1 << 0)
/**
* struct dev_pagemap - metadata for ZONE_DEVICE mappings
* @page_free: free page callback when page refcount reaches 1
* @altmap: pre-allocated/reserved memory for vmemmap allocations
* @res: physical address range covered by @ref
* @ref: reference count that pins the devm_memremap_pages() mapping
* @kill: callback to transition @ref to the dead state
* @cleanup: callback to wait for @ref to be idle and reap it
* @internal_ref: internal reference if @ref is not provided by the caller
* @done: completion for @internal_ref
* @dev: host device of the mapping for debug
* @data: private data pointer for page_free()
* @type: memory type: see MEMORY_* in memory_hotplug.h
* @flags: PGMAP_* flags to specify defailed behavior
* @ops: method table
*/
struct dev_pagemap {
dev_page_free_t page_free;
struct vmem_altmap altmap;
bool altmap_valid;
struct resource res;
struct percpu_ref *ref;
void (*kill)(struct percpu_ref *ref);
void (*cleanup)(struct percpu_ref *ref);
struct percpu_ref internal_ref;
struct completion done;
struct device *dev;
void *data;
enum memory_type type;
unsigned int flags;
u64 pci_p2pdma_bus_offset;
const struct dev_pagemap_ops *ops;
};
static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
{
if (pgmap->flags & PGMAP_ALTMAP_VALID)
return &pgmap->altmap;
return NULL;
}
#ifdef CONFIG_ZONE_DEVICE
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
......
......@@ -937,8 +937,6 @@ static inline bool is_zone_device_page(const struct page *page)
#endif
#ifdef CONFIG_DEV_PAGEMAP_OPS
void dev_pagemap_get_ops(void);
void dev_pagemap_put_ops(void);
void __put_devmap_managed_page(struct page *page);
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
static inline bool put_devmap_managed_page(struct page *page)
......@@ -949,7 +947,6 @@ static inline bool put_devmap_managed_page(struct page *page)
return false;
switch (page->pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
case MEMORY_DEVICE_PUBLIC:
case MEMORY_DEVICE_FS_DAX:
__put_devmap_managed_page(page);
return true;
......@@ -965,12 +962,6 @@ static inline bool is_device_private_page(const struct page *page)
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
}
static inline bool is_device_public_page(const struct page *page)
{
return is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PUBLIC;
}
#ifdef CONFIG_PCI_P2PDMA
static inline bool is_pci_p2pdma_page(const struct page *page)
{
......@@ -985,14 +976,6 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
#endif /* CONFIG_PCI_P2PDMA */
#else /* CONFIG_DEV_PAGEMAP_OPS */
static inline void dev_pagemap_get_ops(void)
{
}
static inline void dev_pagemap_put_ops(void)
{
}
static inline bool put_devmap_managed_page(struct page *page)
{
return false;
......@@ -1003,11 +986,6 @@ static inline bool is_device_private_page(const struct page *page)
return false;
}
static inline bool is_device_public_page(const struct page *page)
{
return false;
}
static inline bool is_pci_p2pdma_page(const struct page *page)
{
return false;
......@@ -1436,10 +1414,8 @@ struct zap_details {
pgoff_t last_index; /* Highest page->index to unmap */
};
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, bool with_public_device);
#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd);
......
......@@ -158,7 +158,7 @@ struct page {
struct { /* ZONE_DEVICE pages */
/** @pgmap: Points to the hosting device page map. */
struct dev_pagemap *pgmap;
unsigned long hmm_data;
void *zone_device_data;
unsigned long _zd_pad_1; /* uses mapping */
};
......@@ -503,7 +503,7 @@ struct mm_struct {
#endif
struct work_struct async_put_work;
#if IS_ENABLED(CONFIG_HMM)
#ifdef CONFIG_HMM_MIRROR
/* HMM needs to track a few things per mm */
struct hmm *hmm;
#endif
......
......@@ -129,12 +129,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
{
return pfn_to_page(swp_offset(entry));
}
vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
unsigned long addr,
swp_entry_t entry,
unsigned int flags,
pmd_t *pmdp);
#else /* CONFIG_DEVICE_PRIVATE */
static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
{
......@@ -164,15 +158,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
{
return NULL;
}
static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
unsigned long addr,
swp_entry_t entry,
unsigned int flags,
pmd_t *pmdp)
{
return VM_FAULT_SIGBUS;
}
#endif /* CONFIG_DEVICE_PRIVATE */
#ifdef CONFIG_MIGRATION
......
......@@ -677,7 +677,6 @@ void __mmdrop(struct mm_struct *mm)
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
hmm_mm_destroy(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
......
......@@ -11,41 +11,39 @@
#include <linux/types.h>
#include <linux/wait_bit.h>
#include <linux/xarray.h>
#include <linux/hmm.h>
static DEFINE_XARRAY(pgmap_array);
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
unsigned long addr,
swp_entry_t entry,
unsigned int flags,
pmd_t *pmdp)
#ifdef CONFIG_DEV_PAGEMAP_OPS
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL(devmap_managed_key);
static atomic_t devmap_managed_enable;
static void devmap_managed_enable_put(void *data)
{
struct page *page = device_private_entry_to_page(entry);
struct hmm_devmem *devmem;
if (atomic_dec_and_test(&devmap_managed_enable))
static_branch_disable(&devmap_managed_key);
}
devmem = container_of(page->pgmap, typeof(*devmem), pagemap);
static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
{
if (!pgmap->ops || !pgmap->ops->page_free) {
WARN(1, "Missing page_free method\n");
return -EINVAL;
}
/*
* The page_fault() callback must migrate page back to system memory
* so that CPU can access it. This might fail for various reasons
* (device issue, device was unsafely unplugged, ...). When such
* error conditions happen, the callback must return VM_FAULT_SIGBUS.
*
* Note that because memory cgroup charges are accounted to the device
* memory, this should never fail because of memory restrictions (but
* allocation of regular system page might still fail because we are
* out of memory).
*
* There is a more in-depth description of what that callback can and
* cannot do, in include/linux/memremap.h
*/
return devmem->page_fault(vma, addr, page, flags, pmdp);
if (atomic_inc_return(&devmap_managed_enable) == 1)
static_branch_enable(&devmap_managed_key);
return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL);
}
#endif /* CONFIG_DEVICE_PRIVATE */
#else
static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
{
return -EINVAL;
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */
static void pgmap_array_delete(struct resource *res)
{
......@@ -56,14 +54,8 @@ static void pgmap_array_delete(struct resource *res)
static unsigned long pfn_first(struct dev_pagemap *pgmap)
{
const struct resource *res = &pgmap->res;
struct vmem_altmap *altmap = &pgmap->altmap;
unsigned long pfn;
pfn = res->start >> PAGE_SHIFT;
if (pgmap->altmap_valid)
pfn += vmem_altmap_offset(altmap);
return pfn;
return (pgmap->res.start >> PAGE_SHIFT) +
vmem_altmap_offset(pgmap_altmap(pgmap));
}
static unsigned long pfn_end(struct dev_pagemap *pgmap)
......@@ -83,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn)
#define for_each_device_pfn(pfn, map) \
for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
static void dev_pagemap_kill(struct dev_pagemap *pgmap)
{
if (pgmap->ops && pgmap->ops->kill)
pgmap->ops->kill(pgmap);
else
percpu_ref_kill(pgmap->ref);
}
static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
{
if (pgmap->ops && pgmap->ops->cleanup) {
pgmap->ops->cleanup(pgmap);
} else {
wait_for_completion(&pgmap->done);
percpu_ref_exit(pgmap->ref);
}
}
static void devm_memremap_pages_release(void *data)
{
struct dev_pagemap *pgmap = data;
......@@ -92,10 +102,10 @@ static void devm_memremap_pages_release(void *data)
unsigned long pfn;
int nid;
pgmap->kill(pgmap->ref);
dev_pagemap_kill(pgmap);
for_each_device_pfn(pfn, pgmap)
put_page(pfn_to_page(pfn));
pgmap->cleanup(pgmap->ref);
dev_pagemap_cleanup(pgmap);
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
......@@ -111,7 +121,7 @@ static void devm_memremap_pages_release(void *data)
align_size >> PAGE_SHIFT, NULL);
} else {
arch_remove_memory(nid, align_start, align_size,
pgmap->altmap_valid ? &pgmap->altmap : NULL);
pgmap_altmap(pgmap));
kasan_remove_zero_shadow(__va(align_start), align_size);
}
mem_hotplug_done();
......@@ -122,20 +132,29 @@ static void devm_memremap_pages_release(void *data)
"%s: failed to free all reserved pages\n", __func__);
}
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
{
struct dev_pagemap *pgmap =
container_of(ref, struct dev_pagemap, internal_ref);
complete(&pgmap->done);
}
/**
* devm_memremap_pages - remap and provide memmap backing for the given resource
* @dev: hosting device for @res
* @pgmap: pointer to a struct dev_pagemap
*
* Notes:
* 1/ At a minimum the res, ref and type members of @pgmap must be initialized
* 1/ At a minimum the res and type members of @pgmap must be initialized
* by the caller before passing it to this function
*
* 2/ The altmap field may optionally be initialized, in which case altmap_valid
* must be set to true
* 2/ The altmap field may optionally be initialized, in which case
* PGMAP_ALTMAP_VALID must be set in pgmap->flags.
*
* 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
* at devm_memremap_pages_release() time, or if this routine fails.
* 3/ The ref field may optionally be provided, in which pgmap->ref must be
* 'live' on entry and will be killed and reaped at
* devm_memremap_pages_release() time, or if this routine fails.
*
* 4/ res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
......@@ -144,22 +163,66 @@ static void devm_memremap_pages_release(void *data)
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
resource_size_t align_start, align_size, align_end;
struct vmem_altmap *altmap = pgmap->altmap_valid ?
&pgmap->altmap : NULL;
struct resource *res = &pgmap->res;
struct dev_pagemap *conflict_pgmap;
struct mhp_restrictions restrictions = {
/*
* We do not want any optional features only our own memmap
*/
.altmap = altmap,
.altmap = pgmap_altmap(pgmap),
};
pgprot_t pgprot = PAGE_KERNEL;
int error, nid, is_ram;
bool need_devmap_managed = true;
switch (pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
WARN(1, "Device private memory not supported\n");
return ERR_PTR(-EINVAL);
}
if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
WARN(1, "Missing migrate_to_ram method\n");
return ERR_PTR(-EINVAL);
}
break;
case MEMORY_DEVICE_FS_DAX:
if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
WARN(1, "File system DAX not supported\n");
return ERR_PTR(-EINVAL);
}
break;
case MEMORY_DEVICE_DEVDAX:
case MEMORY_DEVICE_PCI_P2PDMA:
need_devmap_managed = false;
break;
default:
WARN(1, "Invalid pgmap type %d\n", pgmap->type);
break;
}
if (!pgmap->ref) {
if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
return ERR_PTR(-EINVAL);
init_completion(&pgmap->done);
error = percpu_ref_init(&pgmap->internal_ref,
dev_pagemap_percpu_release, 0, GFP_KERNEL);
if (error)
return ERR_PTR(error);
pgmap->ref = &pgmap->internal_ref;
} else {
if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
WARN(1, "Missing reference count teardown definition\n");
return ERR_PTR(-EINVAL);
}
}
if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
WARN(1, "Missing reference count teardown definition\n");
return ERR_PTR(-EINVAL);
if (need_devmap_managed) {
error = devmap_managed_enable_get(dev, pgmap);
if (error)
return ERR_PTR(error);
}
align_start = res->start & ~(SECTION_SIZE - 1);
......@@ -241,7 +304,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
align_size >> PAGE_SHIFT, altmap);
align_size >> PAGE_SHIFT, pgmap_altmap(pgmap));
}
mem_hotplug_done();
......@@ -271,9 +334,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
err_pfn_remap:
pgmap_array_delete(res);
err_array:
pgmap->kill(pgmap->ref);
pgmap->cleanup(pgmap->ref);
dev_pagemap_kill(pgmap);
dev_pagemap_cleanup(pgmap);
return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(devm_memremap_pages);
......@@ -287,7 +349,9 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages);
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
/* number of pfns from base where pfn_to_page() is valid */
return altmap->reserve + altmap->free;
if (altmap)
return altmap->reserve + altmap->free;
return 0;
}
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
......@@ -329,28 +393,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
EXPORT_SYMBOL_GPL(get_dev_pagemap);
#ifdef CONFIG_DEV_PAGEMAP_OPS
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
EXPORT_SYMBOL(devmap_managed_key);
static atomic_t devmap_enable;
/*
* Toggle the static key for ->page_free() callbacks when dev_pagemap
* pages go idle.
*/
void dev_pagemap_get_ops(void)
{
if (atomic_inc_return(&devmap_enable) == 1)
static_branch_enable(&devmap_managed_key);
}
EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);
void dev_pagemap_put_ops(void)
{
if (atomic_dec_and_test(&devmap_enable))
static_branch_disable(&devmap_managed_key);
}
EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
void __put_devmap_managed_page(struct page *page)
{
int count = page_ref_dec_return(page);
......@@ -366,7 +408,7 @@ void __put_devmap_managed_page(struct page *page)
mem_cgroup_uncharge(page);
page->pgmap->page_free(page, page->pgmap->data);
page->pgmap->ops->page_free(page);
} else if (!count)
__put_page(page);
}
......
......@@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head)
}
EXPORT_SYMBOL(resource_list_free);
#ifdef CONFIG_DEVICE_PRIVATE
/**
* devm_request_free_mem_region - find free region for device private memory
*
* @dev: device struct to bind the resource to
* @size: size in bytes of the device memory to add
* @base: resource tree to look in
*
* This function tries to find an empty range of physical address big enough to
* contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
* memory, which in turn allocates struct pages.
*/
struct resource *devm_request_free_mem_region(struct device *dev,
struct resource *base, unsigned long size)
{
resource_size_t end, addr;
struct resource *res;
size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
addr = end - size + 1UL;
for (; addr > size && addr >= base->start; addr -= size) {
if (region_intersects(addr, size, 0, IORES_DESC_NONE) !=
REGION_DISJOINT)
continue;
res = devm_request_mem_region(dev, addr, size, dev_name(dev));
if (!res)
return ERR_PTR(-ENOMEM);
res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
return res;
}
return ERR_PTR(-ERANGE);
}
EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
#endif /* CONFIG_DEVICE_PRIVATE */
static int __init strict_iomem(char *str)
{
if (strstr(str, "relaxed"))
......
......@@ -670,47 +670,17 @@ config ZONE_DEVICE
If FS_DAX is enabled, then say Y.
config ARCH_HAS_HMM_MIRROR
bool
default y
depends on (X86_64 || PPC64)
depends on MMU && 64BIT
config ARCH_HAS_HMM_DEVICE
bool
default y
depends on (X86_64 || PPC64)
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
depends on ARCH_HAS_ZONE_DEVICE
select XARRAY_MULTI
config ARCH_HAS_HMM
bool
default y
depends on (X86_64 || PPC64)
depends on ZONE_DEVICE
depends on MMU && 64BIT
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
config MIGRATE_VMA_HELPER
bool
config DEV_PAGEMAP_OPS
bool
config HMM
bool
select MMU_NOTIFIER
select MIGRATE_VMA_HELPER
config HMM_MIRROR
bool "HMM mirror CPU page table into a device page table"
depends on ARCH_HAS_HMM
select HMM
depends on (X86_64 || PPC64)
depends on MMU && 64BIT
select MMU_NOTIFIER
help
Select HMM_MIRROR if you want to mirror range of the CPU page table of a
process into a device page table. Here, mirror means "keep synchronized".
......@@ -720,8 +690,7 @@ config HMM_MIRROR
config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
depends on ARCH_HAS_HMM
select HMM
depends on ZONE_DEVICE
select DEV_PAGEMAP_OPS
help
......@@ -729,17 +698,6 @@ config DEVICE_PRIVATE
memory; i.e., memory that is only accessible from the device (or
group of devices). You likely also want to select HMM_MIRROR.
config DEVICE_PUBLIC
bool "Addressable device memory (like GPU memory)"
depends on ARCH_HAS_HMM
select HMM
select DEV_PAGEMAP_OPS
help
Allows creation of struct pages to represent addressable device
memory; i.e., memory that is accessible from both the device and
the CPU
config FRAME_VECTOR
bool
......
......@@ -102,5 +102,5 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_HMM) += hmm.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
......@@ -609,13 +609,6 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
goto unmap;
*page = pte_page(*pte);
/*
* This should never happen (a device public page in the gate
* area).
*/
if (is_device_public_page(*page))
goto unmap;
}
if (unlikely(!try_get_page(*page))) {
ret = -ENOMEM;
......
This diff is collapsed.
......@@ -354,7 +354,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue;
}
page = _vm_normal_page(vma, addr, ptent, true);
page = vm_normal_page(vma, addr, ptent);
if (!page)
continue;
......
......@@ -4908,7 +4908,7 @@ enum mc_target_type {
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
struct page *page = _vm_normal_page(vma, addr, ptent, true);
struct page *page = vm_normal_page(vma, addr, ptent);
if (!page || !page_mapped(page))
return NULL;
......@@ -5109,8 +5109,8 @@ static int mem_cgroup_move_account(struct page *page,
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
* or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
* (so ZONE_DEVICE page and thus not on the lru).
* For now we such page is charge like a regular page would be as for all
* intent and purposes it is just special memory taking the place of a
* regular page.
......@@ -5144,8 +5144,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
if (is_device_private_page(page) ||
is_device_public_page(page))
if (is_device_private_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
......@@ -5216,8 +5215,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
if (ptl) {
/*
* Note their can not be MC_TARGET_DEVICE for now as we do not
* support transparent huge page with MEMORY_DEVICE_PUBLIC or
* MEMORY_DEVICE_PRIVATE but this might change.
* support transparent huge page with MEMORY_DEVICE_PRIVATE but
* this might change.
*/
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
......
......@@ -1177,16 +1177,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
goto unlock;
}
switch (pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
case MEMORY_DEVICE_PUBLIC:
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
/*
* TODO: Handle HMM pages which may need coordination
* with device-side memory.
*/
goto unlock;
default:
break;
}
/*
......
......@@ -571,8 +571,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
* PFNMAP mappings in order to support COWable mappings.
*
*/
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, bool with_public_device)
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
......@@ -585,29 +585,6 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (is_zero_pfn(pfn))
return NULL;
/*
* Device public pages are special pages (they are ZONE_DEVICE
* pages but different from persistent memory). They behave
* allmost like normal pages. The difference is that they are
* not on the lru and thus should never be involve with any-
* thing that involve lru manipulation (mlock, numa balancing,
* ...).
*
* This is why we still want to return NULL for such page from
* vm_normal_page() so that we do not have to special case all
* call site of vm_normal_page().
*/
if (likely(pfn <= highest_memmap_pfn)) {
struct page *page = pfn_to_page(pfn);
if (is_device_public_page(page)) {
if (with_public_device)
return page;
return NULL;
}
}
if (pte_devmap(pte))
return NULL;
......@@ -797,17 +774,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
rss[mm_counter(page)]++;
} else if (pte_devmap(pte)) {
page = pte_page(pte);
/*
* Cache coherent device memory behave like regular page and
* not like persistent memory page. For more informations see
* MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
*/
if (is_device_public_page(page)) {
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
}
}
out_set_pte:
......@@ -1063,7 +1029,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (pte_present(ptent)) {
struct page *page;
page = _vm_normal_page(vma, addr, ptent, true);
page = vm_normal_page(vma, addr, ptent);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
......@@ -2777,13 +2743,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
} else if (is_device_private_entry(entry)) {
/*
* For un-addressable device memory we call the pgmap
* fault handler callback. The callback must migrate
* the page back to some CPU accessible page.
*/
ret = device_private_entry_fault(vma, vmf->address, entry,
vmf->flags, vmf->pmd);
vmf->page = device_private_entry_to_page(entry);
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
......
......@@ -557,10 +557,8 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
int sections_to_remove;
/* In the ZONE_DEVICE case device driver owns the memory region */
if (is_dev_zone(zone)) {
if (altmap)
map_offset = vmem_altmap_offset(altmap);
}
if (is_dev_zone(zone))
map_offset = vmem_altmap_offset(altmap);
clear_zone_contiguous(zone);
......
......@@ -2098,6 +2098,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
out:
return page;
}
EXPORT_SYMBOL(alloc_pages_vma);
/**
* alloc_pages_current - Allocate pages.
......
......@@ -246,8 +246,6 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (is_device_private_page(new)) {
entry = make_device_private_entry(new, pte_write(pte));
pte = swp_entry_to_pte(entry);
} else if (is_device_public_page(new)) {
pte = pte_mkdevmap(pte);
}
}
......@@ -381,7 +379,6 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
* ZONE_DEVICE pages.
*/
expected_count += is_device_private_page(page);
expected_count += is_device_public_page(page);
if (mapping)
expected_count += hpage_nr_pages(page) + page_has_private(page);
......@@ -994,10 +991,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
if (!PageMappingFlags(page))
page->mapping = NULL;
if (unlikely(is_zone_device_page(newpage))) {
if (is_device_public_page(newpage))
flush_dcache_page(newpage);
} else
if (likely(!is_zone_device_page(newpage)))
flush_dcache_page(newpage);
}
......@@ -2265,7 +2259,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
pfn = 0;
goto next;
}
page = _vm_normal_page(migrate->vma, addr, pte, true);
page = vm_normal_page(migrate->vma, addr, pte);
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
......@@ -2406,16 +2400,7 @@ static bool migrate_vma_check_page(struct page *page)
* FIXME proper solution is to rework migration_entry_wait() so
* it does not need to take a reference on page.
*/
if (is_device_private_page(page))
return true;
/*
* Only allow device public page to be migrated and account for
* the extra reference count imply by ZONE_DEVICE pages.
*/
if (!is_device_public_page(page))
return false;
extra++;
return is_device_private_page(page);
}
/* For file back page */
......@@ -2665,11 +2650,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
entry = swp_entry_to_pte(swp_entry);
} else if (is_device_public_page(page)) {
entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
entry = pte_mkdevmap(entry);
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
......@@ -2789,7 +2769,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
} else if (!is_device_public_page(newpage)) {
} else {
/*
* Other types of ZONE_DEVICE page are not
* supported.
......
......@@ -5925,6 +5925,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
{
unsigned long pfn, end_pfn = start_pfn + size;
struct pglist_data *pgdat = zone->zone_pgdat;
struct vmem_altmap *altmap = pgmap_altmap(pgmap);
unsigned long zone_idx = zone_idx(zone);
unsigned long start = jiffies;
int nid = pgdat->node_id;
......@@ -5937,9 +5938,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
* of the pages reserved for the memmap, so we can just jump to
* the end of that region and start processing the device pages.
*/
if (pgmap->altmap_valid) {
struct vmem_altmap *altmap = &pgmap->altmap;
if (altmap) {
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
size = end_pfn - start_pfn;
}
......@@ -5959,12 +5958,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
__SetPageReserved(page);
/*
* ZONE_DEVICE pages union ->lru with a ->pgmap back
* pointer and hmm_data. It is a bug if a ZONE_DEVICE
* page is ever freed or placed on a driver-private list.
* ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
* ever freed or placed on a driver-private list.
*/
page->pgmap = pgmap;
page->hmm_data = 0;
page->zone_device_data = NULL;
/*
* Mark the block movable so that blocks are reserved for
......
......@@ -740,15 +740,20 @@ void release_pages(struct page **pages, int nr)
if (is_huge_zero_page(page))
continue;
/* Device public page can not be huge page */
if (is_device_public_page(page)) {
if (is_zone_device_page(page)) {
if (locked_pgdat) {
spin_unlock_irqrestore(&locked_pgdat->lru_lock,
flags);
locked_pgdat = NULL;
}
put_devmap_managed_page(page);
continue;
/*
* ZONE_DEVICE pages that return 'false' from
* put_devmap_managed_page() do not require special
* processing, and instead, expect a call to
* put_page_testzero().
*/
if (put_devmap_managed_page(page))
continue;
}
page = compound_head(page);
......
......@@ -100,25 +100,60 @@ static void nfit_test_kill(void *_pgmap)
{
struct dev_pagemap *pgmap = _pgmap;
WARN_ON(!pgmap || !pgmap->ref || !pgmap->kill || !pgmap->cleanup);
pgmap->kill(pgmap->ref);
pgmap->cleanup(pgmap->ref);
WARN_ON(!pgmap || !pgmap->ref);
if (pgmap->ops && pgmap->ops->kill)
pgmap->ops->kill(pgmap);
else
percpu_ref_kill(pgmap->ref);
if (pgmap->ops && pgmap->ops->cleanup) {
pgmap->ops->cleanup(pgmap);
} else {
wait_for_completion(&pgmap->done);
percpu_ref_exit(pgmap->ref);
}
}
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
{
struct dev_pagemap *pgmap =
container_of(ref, struct dev_pagemap, internal_ref);
complete(&pgmap->done);
}
void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
{
int error;
resource_size_t offset = pgmap->res.start;
struct nfit_test_resource *nfit_res = get_nfit_res(offset);
if (nfit_res) {
int rc;
rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
if (rc)
return ERR_PTR(rc);
return nfit_res->buf + offset - nfit_res->res.start;
if (!nfit_res)
return devm_memremap_pages(dev, pgmap);
pgmap->dev = dev;
if (!pgmap->ref) {
if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
return ERR_PTR(-EINVAL);
init_completion(&pgmap->done);
error = percpu_ref_init(&pgmap->internal_ref,
dev_pagemap_percpu_release, 0, GFP_KERNEL);
if (error)
return ERR_PTR(error);
pgmap->ref = &pgmap->internal_ref;
} else {
if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
WARN(1, "Missing reference count teardown definition\n");
return ERR_PTR(-EINVAL);
}
}
return devm_memremap_pages(dev, pgmap);
error = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
if (error)
return ERR_PTR(error);
return nfit_res->buf + offset - nfit_res->res.start;
}
EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment