Commit fec88ab0 authored by Linus Torvalds

Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull HMM updates from Jason Gunthorpe:
 "Improvements and bug fixes for the hmm interface in the kernel:

   - Improve clarity, locking and APIs related to the 'hmm mirror'
     feature merged last cycle. In linux-next we now see AMDGPU and
     nouveau using this API.

   - Remove old or transitional hmm APIs. These are holdovers from the
     past with no users, or APIs that existed only to manage cross-tree
     conflicts. There are still a few more of these cleanups that didn't
     make the merge window cutoff.

   - Improve some core mm APIs:
       - export alloc_pages_vma() for driver use
       - refactor into devm_request_free_mem_region() to manage
         DEVICE_PRIVATE resource reservations
       - refactor duplicative driver code into the core dev_pagemap
         struct

   - Remove hmm wrappers around the improved core mm APIs; drivers now
     use the simplified API directly (see the usage sketch below)

   - Remove DEVICE_PUBLIC

   - Simplify the kconfig flow for the hmm users and core code"
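
To make the API changes described above concrete, here is a minimal sketch of
what a ZONE_DEVICE driver setup looks like after this merge, loosely modeled on
the nouveau_dmem_init() and memremap.h hunks in the diff below. The struct
mydev / mydev_*() names are hypothetical placeholders rather than an in-tree
driver, and the callback bodies are stubs:

	#include <linux/err.h>
	#include <linux/ioport.h>
	#include <linux/memremap.h>
	#include <linux/mm.h>

	/* hypothetical driver-private state embedding the pagemap */
	struct mydev {
		struct device *dev;
		struct dev_pagemap pagemap;
	};

	static void mydev_page_free(struct page *page)
	{
		/* reclaim the device-side backing store, e.g. via page->zone_device_data */
	}

	static vm_fault_t mydev_migrate_to_ram(struct vm_fault *vmf)
	{
		/* migrate the faulting DEVICE_PRIVATE page back to system RAM */
		return 0;
	}

	static const struct dev_pagemap_ops mydev_pagemap_ops = {
		.page_free	= mydev_page_free,
		.migrate_to_ram	= mydev_migrate_to_ram,
	};

	static int mydev_init_pages(struct mydev *mydev, unsigned long size)
	{
		struct resource *res;
		void *addr;

		/* carve a free physical range out of iomem for ZONE_DEVICE */
		res = devm_request_free_mem_region(mydev->dev, &iomem_resource, size);
		if (IS_ERR(res))
			return PTR_ERR(res);

		mydev->pagemap.type = MEMORY_DEVICE_PRIVATE;
		mydev->pagemap.res = *res;
		mydev->pagemap.ops = &mydev_pagemap_ops;

		/*
		 * No pgmap->ref / ->kill / ->cleanup here: when ref is left NULL,
		 * devm_memremap_pages() falls back to the internal refcount added
		 * by this series.
		 */
		addr = devm_memremap_pages(mydev->dev, &mydev->pagemap);
		return PTR_ERR_OR_ZERO(addr);
	}

Drivers that still own their refcount (pmem's request_queue usage counter, for
example) keep setting pgmap->ref and provide kill/cleanup methods in
dev_pagemap_ops instead of the removed function pointers in struct dev_pagemap.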

* tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (42 commits)
  mm: don't select MIGRATE_VMA_HELPER from HMM_MIRROR
  mm: remove the HMM config option
  mm: sort out the DEVICE_PRIVATE Kconfig mess
  mm: simplify ZONE_DEVICE page private data
  mm: remove hmm_devmem_add
  mm: remove hmm_vma_alloc_locked_page
  nouveau: use devm_memremap_pages directly
  nouveau: use alloc_page_vma directly
  PCI/P2PDMA: use the dev_pagemap internal refcount
  device-dax: use the dev_pagemap internal refcount
  memremap: provide an optional internal refcount in struct dev_pagemap
  memremap: replace the altmap_valid field with a PGMAP_ALTMAP_VALID flag
  memremap: remove the data field in struct dev_pagemap
  memremap: add a migrate_to_ram method to struct dev_pagemap_ops
  memremap: lift the devmap_enable manipulation into devm_memremap_pages
  memremap: pass a struct dev_pagemap to ->kill and ->cleanup
  memremap: move dev_pagemap callbacks into a separate structure
  memremap: validate the pagemap type passed to devm_memremap_pages
  mm: factor out a devm_request_free_mem_region helper
  mm: export alloc_pages_vma
  ...
parents fa6e951a cc5dfd59
@@ -131,17 +131,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct page *page;
+	struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
 	int ret;
-	/*
-	 * If we have an altmap then we need to skip over any reserved PFNs
-	 * when querying the zone.
-	 */
-	page = pfn_to_page(start_pfn);
-	if (altmap)
-		page += vmem_altmap_offset(altmap);
 	__remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
 	/* Remove htab bolted mappings for this section of memory */
...
@@ -1213,13 +1213,9 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
-	struct page *page = pfn_to_page(start_pfn);
-	struct zone *zone;
-	/* With altmap the first mapped page is offset from @start */
-	if (altmap)
-		page += vmem_altmap_offset(altmap);
-	zone = page_zone(page);
+	struct page *page = pfn_to_page(start_pfn) + vmem_altmap_offset(altmap);
+	struct zone *zone = page_zone(page);
 	__remove_pages(zone, start_pfn, nr_pages, altmap);
 	kernel_physical_mapping_remove(start, start + size);
 }
...
@@ -43,8 +43,6 @@ struct dax_region {
  * @target_node: effective numa node if dev_dax memory range is onlined
  * @dev - device core
  * @pgmap - pgmap for memmap setup / lifetime (driver owned)
- * @ref: pgmap reference count (driver owned)
- * @cmp: @ref final put completion (driver owned)
  */
 struct dev_dax {
 	struct dax_region *region;
@@ -52,8 +50,6 @@ struct dev_dax {
 	int target_node;
 	struct device dev;
 	struct dev_pagemap pgmap;
-	struct percpu_ref ref;
-	struct completion cmp;
 };
 static inline struct dev_dax *to_dev_dax(struct device *dev)
...
@@ -14,37 +14,6 @@
 #include "dax-private.h"
 #include "bus.h"
-static struct dev_dax *ref_to_dev_dax(struct percpu_ref *ref)
-{
-	return container_of(ref, struct dev_dax, ref);
-}
-static void dev_dax_percpu_release(struct percpu_ref *ref)
-{
-	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
-	dev_dbg(&dev_dax->dev, "%s\n", __func__);
-	complete(&dev_dax->cmp);
-}
-static void dev_dax_percpu_exit(struct percpu_ref *ref)
-{
-	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
-	dev_dbg(&dev_dax->dev, "%s\n", __func__);
-	wait_for_completion(&dev_dax->cmp);
-	percpu_ref_exit(ref);
-}
-static void dev_dax_percpu_kill(struct percpu_ref *data)
-{
-	struct percpu_ref *ref = data;
-	struct dev_dax *dev_dax = ref_to_dev_dax(ref);
-	dev_dbg(&dev_dax->dev, "%s\n", __func__);
-	percpu_ref_kill(ref);
-}
 static int check_vma(struct dev_dax *dev_dax, struct vm_area_struct *vma,
 		const char *func)
 {
@@ -459,15 +428,7 @@ int dev_dax_probe(struct device *dev)
 		return -EBUSY;
 	}
-	init_completion(&dev_dax->cmp);
-	rc = percpu_ref_init(&dev_dax->ref, dev_dax_percpu_release, 0,
-			GFP_KERNEL);
-	if (rc)
-		return rc;
-	dev_dax->pgmap.ref = &dev_dax->ref;
-	dev_dax->pgmap.kill = dev_dax_percpu_kill;
-	dev_dax->pgmap.cleanup = dev_dax_percpu_exit;
+	dev_dax->pgmap.type = MEMORY_DEVICE_DEVDAX;
 	addr = devm_memremap_pages(dev, &dev_dax->pgmap);
 	if (IS_ERR(addr))
 		return PTR_ERR(addr);
...
@@ -16,7 +16,7 @@ struct dev_dax *__dax_pmem_probe(struct device *dev, enum dev_dax_subsys subsys)
 	struct dev_dax *dev_dax;
 	struct nd_namespace_io *nsio;
 	struct dax_region *dax_region;
-	struct dev_pagemap pgmap = { 0 };
+	struct dev_pagemap pgmap = { };
 	struct nd_namespace_common *ndns;
 	struct nd_dax *nd_dax = to_nd_dax(dev);
 	struct nd_pfn *nd_pfn = &nd_dax->nd_pfn;
...
@@ -84,11 +84,11 @@ config DRM_NOUVEAU_BACKLIGHT
 config DRM_NOUVEAU_SVM
 	bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support"
-	depends on ARCH_HAS_HMM
+	depends on DEVICE_PRIVATE
 	depends on DRM_NOUVEAU
+	depends on HMM_MIRROR
 	depends on STAGING
-	select HMM_MIRROR
-	select DEVICE_PRIVATE
+	select MIGRATE_VMA_HELPER
 	default n
 	help
 	  Say Y here if you want to enable experimental support for
...
@@ -72,7 +72,8 @@ struct nouveau_dmem_migrate {
 };
 struct nouveau_dmem {
-	struct hmm_devmem *devmem;
+	struct nouveau_drm *drm;
+	struct dev_pagemap pagemap;
 	struct nouveau_dmem_migrate migrate;
 	struct list_head chunk_free;
 	struct list_head chunk_full;
@@ -80,6 +81,11 @@ struct nouveau_dmem {
 	struct mutex mutex;
 };
+static inline struct nouveau_dmem *page_to_dmem(struct page *page)
+{
+	return container_of(page->pgmap, struct nouveau_dmem, pagemap);
+}
 struct nouveau_dmem_fault {
 	struct nouveau_drm *drm;
 	struct nouveau_fence *fence;
@@ -96,14 +102,10 @@ struct nouveau_migrate {
 	unsigned long dma_nr;
 };
-static void
-nouveau_dmem_free(struct hmm_devmem *devmem, struct page *page)
+static void nouveau_dmem_page_free(struct page *page)
 {
-	struct nouveau_dmem_chunk *chunk;
-	unsigned long idx;
-	chunk = (void *)hmm_devmem_page_get_drvdata(page);
-	idx = page_to_pfn(page) - chunk->pfn_first;
+	struct nouveau_dmem_chunk *chunk = page->zone_device_data;
+	unsigned long idx = page_to_pfn(page) - chunk->pfn_first;
 	/*
 	 * FIXME:
@@ -148,11 +150,12 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
 		if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
 			continue;
-		dpage = hmm_vma_alloc_locked_page(vma, addr);
+		dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr);
 		if (!dpage) {
 			dst_pfns[i] = MIGRATE_PFN_ERROR;
 			continue;
 		}
+		lock_page(dpage);
 		dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) |
 			      MIGRATE_PFN_LOCKED;
@@ -194,7 +197,7 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
 		dst_addr = fault->dma[fault->npages++];
-		chunk = (void *)hmm_devmem_page_get_drvdata(spage);
+		chunk = spage->zone_device_data;
 		src_addr = page_to_pfn(spage) - chunk->pfn_first;
 		src_addr = (src_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
@@ -259,29 +262,21 @@ static const struct migrate_vma_ops nouveau_dmem_fault_migrate_ops = {
 	.finalize_and_map = nouveau_dmem_fault_finalize_and_map,
 };
-static vm_fault_t
-nouveau_dmem_fault(struct hmm_devmem *devmem,
-		   struct vm_area_struct *vma,
-		   unsigned long addr,
-		   const struct page *page,
-		   unsigned int flags,
-		   pmd_t *pmdp)
+static vm_fault_t nouveau_dmem_migrate_to_ram(struct vm_fault *vmf)
 {
-	struct drm_device *drm_dev = dev_get_drvdata(devmem->device);
+	struct nouveau_dmem *dmem = page_to_dmem(vmf->page);
 	unsigned long src[1] = {0}, dst[1] = {0};
-	struct nouveau_dmem_fault fault = {0};
+	struct nouveau_dmem_fault fault = { .drm = dmem->drm };
 	int ret;
 	/*
 	 * FIXME what we really want is to find some heuristic to migrate more
 	 * than just one page on CPU fault. When such fault happens it is very
 	 * likely that more surrounding page will CPU fault too.
 	 */
-	fault.drm = nouveau_drm(drm_dev);
-	ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vma, addr,
-			  addr + PAGE_SIZE, src, dst, &fault);
+	ret = migrate_vma(&nouveau_dmem_fault_migrate_ops, vmf->vma,
+			vmf->address, vmf->address + PAGE_SIZE,
			src, dst, &fault);
 	if (ret)
 		return VM_FAULT_SIGBUS;
@@ -291,10 +286,9 @@ nouveau_dmem_fault(struct hmm_devmem *devmem,
 	return 0;
 }
-static const struct hmm_devmem_ops
-nouveau_dmem_devmem_ops = {
-	.free = nouveau_dmem_free,
-	.fault = nouveau_dmem_fault,
+static const struct dev_pagemap_ops nouveau_dmem_pagemap_ops = {
+	.page_free = nouveau_dmem_page_free,
+	.migrate_to_ram = nouveau_dmem_migrate_to_ram,
 };
 static int
@@ -580,7 +574,8 @@ void
 nouveau_dmem_init(struct nouveau_drm *drm)
 {
 	struct device *device = drm->dev->dev;
-	unsigned long i, size;
+	struct resource *res;
+	unsigned long i, size, pfn_first;
 	int ret;
 	/* This only make sense on PASCAL or newer */
@@ -590,6 +585,7 @@ nouveau_dmem_init(struct nouveau_drm *drm)
 	if (!(drm->dmem = kzalloc(sizeof(*drm->dmem), GFP_KERNEL)))
 		return;
+	drm->dmem->drm = drm;
 	mutex_init(&drm->dmem->mutex);
 	INIT_LIST_HEAD(&drm->dmem->chunk_free);
 	INIT_LIST_HEAD(&drm->dmem->chunk_full);
@@ -599,11 +595,8 @@ nouveau_dmem_init(struct nouveau_drm *drm)
 	/* Initialize migration dma helpers before registering memory */
 	ret = nouveau_dmem_migrate_init(drm);
-	if (ret) {
-		kfree(drm->dmem);
-		drm->dmem = NULL;
-		return;
-	}
+	if (ret)
+		goto out_free;
 	/*
 	 * FIXME we need some kind of policy to decide how much VRAM we
@@ -611,14 +604,16 @@ nouveau_dmem_init(struct nouveau_drm *drm)
 	 * and latter if we want to do thing like over commit then we
 	 * could revisit this.
 	 */
-	drm->dmem->devmem = hmm_devmem_add(&nouveau_dmem_devmem_ops,
-					   device, size);
-	if (IS_ERR(drm->dmem->devmem)) {
-		kfree(drm->dmem);
-		drm->dmem = NULL;
-		return;
-	}
+	res = devm_request_free_mem_region(device, &iomem_resource, size);
+	if (IS_ERR(res))
+		goto out_free;
+	drm->dmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+	drm->dmem->pagemap.res = *res;
+	drm->dmem->pagemap.ops = &nouveau_dmem_pagemap_ops;
+	if (IS_ERR(devm_memremap_pages(device, &drm->dmem->pagemap)))
+		goto out_free;
+	pfn_first = res->start >> PAGE_SHIFT;
 	for (i = 0; i < (size / DMEM_CHUNK_SIZE); ++i) {
 		struct nouveau_dmem_chunk *chunk;
 		struct page *page;
@@ -631,17 +626,19 @@ nouveau_dmem_init(struct nouveau_drm *drm)
 		}
 		chunk->drm = drm;
-		chunk->pfn_first = drm->dmem->devmem->pfn_first;
-		chunk->pfn_first += (i * DMEM_CHUNK_NPAGES);
+		chunk->pfn_first = pfn_first + (i * DMEM_CHUNK_NPAGES);
 		list_add_tail(&chunk->list, &drm->dmem->chunk_empty);
 		page = pfn_to_page(chunk->pfn_first);
-		for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page) {
-			hmm_devmem_page_set_drvdata(page, (long)chunk);
-		}
+		for (j = 0; j < DMEM_CHUNK_NPAGES; ++j, ++page)
+			page->zone_device_data = chunk;
 	}
 	NV_INFO(drm, "DMEM: registered %ldMB of device memory\n", size >> 20);
+	return;
+out_free:
+	kfree(drm->dmem);
+	drm->dmem = NULL;
 }
 static void
@@ -697,7 +694,7 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma,
 		if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
 			continue;
-		chunk = (void *)hmm_devmem_page_get_drvdata(dpage);
+		chunk = dpage->zone_device_data;
 		dst_addr = page_to_pfn(dpage) - chunk->pfn_first;
 		dst_addr = (dst_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
@@ -832,13 +829,7 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
 static inline bool
 nouveau_dmem_page(struct nouveau_drm *drm, struct page *page)
 {
-	if (!is_device_private_page(page))
-		return false;
-	if (drm->dmem->devmem != page->pgmap->data)
-		return false;
-	return true;
+	return is_device_private_page(page) && drm->dmem == page_to_dmem(page);
 }
 void
@@ -867,7 +858,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
 			continue;
 		}
-		chunk = (void *)hmm_devmem_page_get_drvdata(page);
+		chunk = page->zone_device_data;
 		addr = page_to_pfn(page) - chunk->pfn_first;
 		addr = (addr + chunk->bo->bo.mem.start) << PAGE_SHIFT;
...
@@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
 		range.values = nouveau_svm_pfn_values;
 		range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
 again:
-		ret = hmm_vma_fault(&range, true);
+		ret = hmm_vma_fault(&svmm->mirror, &range, true);
 		if (ret == 0) {
 			mutex_lock(&svmm->mutex);
 			if (!hmm_vma_range_done(&range)) {
...
@@ -622,7 +622,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 		if (offset < reserve)
 			return -EINVAL;
 		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
-		pgmap->altmap_valid = false;
 	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
 		nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res)
 					- offset) / PAGE_SIZE);
@@ -634,7 +633,7 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
 		memcpy(altmap, &__altmap, sizeof(*altmap));
 		altmap->free = PHYS_PFN(offset - reserve);
 		altmap->alloc = 0;
-		pgmap->altmap_valid = true;
+		pgmap->flags |= PGMAP_ALTMAP_VALID;
 	} else
 		return -ENXIO;
...
@@ -303,24 +303,24 @@ static const struct attribute_group *pmem_attribute_groups[] = {
 	NULL,
 };
-static void __pmem_release_queue(struct percpu_ref *ref)
+static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
 {
-	struct request_queue *q;
-	q = container_of(ref, typeof(*q), q_usage_counter);
+	struct request_queue *q =
+		container_of(pgmap->ref, struct request_queue, q_usage_counter);
 	blk_cleanup_queue(q);
 }
-static void pmem_release_queue(void *ref)
+static void pmem_release_queue(void *pgmap)
 {
-	__pmem_release_queue(ref);
+	pmem_pagemap_cleanup(pgmap);
 }
-static void pmem_freeze_queue(struct percpu_ref *ref)
+static void pmem_pagemap_kill(struct dev_pagemap *pgmap)
 {
-	struct request_queue *q;
-	q = container_of(ref, typeof(*q), q_usage_counter);
+	struct request_queue *q =
+		container_of(pgmap->ref, struct request_queue, q_usage_counter);
 	blk_freeze_queue_start(q);
 }
@@ -334,26 +334,16 @@ static void pmem_release_disk(void *__pmem)
 	put_disk(pmem->disk);
 }
-static void pmem_release_pgmap_ops(void *__pgmap)
-{
-	dev_pagemap_put_ops();
-}
-static void fsdax_pagefree(struct page *page, void *data)
+static void pmem_pagemap_page_free(struct page *page)
 {
 	wake_up_var(&page->_refcount);
 }
-static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
-{
-	dev_pagemap_get_ops();
-	if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
-		return -ENOMEM;
-	pgmap->type = MEMORY_DEVICE_FS_DAX;
-	pgmap->page_free = fsdax_pagefree;
-	return 0;
-}
+static const struct dev_pagemap_ops fsdax_pagemap_ops = {
+	.page_free = pmem_pagemap_page_free,
+	.kill = pmem_pagemap_kill,
+	.cleanup = pmem_pagemap_cleanup,
+};
 static int pmem_attach_disk(struct device *dev,
 		struct nd_namespace_common *ndns)
@@ -409,11 +399,9 @@ static int pmem_attach_disk(struct device *dev,
 	pmem->pfn_flags = PFN_DEV;
 	pmem->pgmap.ref = &q->q_usage_counter;
-	pmem->pgmap.kill = pmem_freeze_queue;
-	pmem->pgmap.cleanup = __pmem_release_queue;
 	if (is_nd_pfn(dev)) {
-		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
-			return -ENOMEM;
+		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pfn_sb = nd_pfn->pfn_sb;
 		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
@@ -424,15 +412,14 @@ static int pmem_attach_disk(struct device *dev,
 		bb_res.start += pmem->data_offset;
 	} else if (pmem_should_map_pages(dev)) {
 		memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
-		pmem->pgmap.altmap_valid = false;
-		if (setup_pagemap_fsdax(dev, &pmem->pgmap))
-			return -ENOMEM;
+		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
+		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
 		pmem->pfn_flags |= PFN_MAP;
 		memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
 	} else {
 		if (devm_add_action_or_reset(dev, pmem_release_queue,
-					&q->q_usage_counter))
+					&pmem->pgmap))
 			return -ENOMEM;
 		addr = devm_memremap(dev, pmem->phys_addr,
 				pmem->size, ARCH_MEMREMAP_PMEM);
...
@@ -25,12 +25,6 @@ struct pci_p2pdma {
 	bool p2pmem_published;
 };
-struct p2pdma_pagemap {
-	struct dev_pagemap pgmap;
-	struct percpu_ref ref;
-	struct completion ref_done;
-};
 static ssize_t size_show(struct device *dev, struct device_attribute *attr,
 			 char *buf)
 {
@@ -79,31 +73,6 @@ static const struct attribute_group p2pmem_group = {
 	.name = "p2pmem",
 };
-static struct p2pdma_pagemap *to_p2p_pgmap(struct percpu_ref *ref)
-{
-	return container_of(ref, struct p2pdma_pagemap, ref);
-}
-static void pci_p2pdma_percpu_release(struct percpu_ref *ref)
-{
-	struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
-	complete(&p2p_pgmap->ref_done);
-}
-static void pci_p2pdma_percpu_kill(struct percpu_ref *ref)
-{
-	percpu_ref_kill(ref);
-}
-static void pci_p2pdma_percpu_cleanup(struct percpu_ref *ref)
-{
-	struct p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(ref);
-	wait_for_completion(&p2p_pgmap->ref_done);
-	percpu_ref_exit(&p2p_pgmap->ref);
-}
 static void pci_p2pdma_release(void *data)
 {
 	struct pci_dev *pdev = data;
@@ -166,7 +135,6 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
 int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 			    u64 offset)
 {
-	struct p2pdma_pagemap *p2p_pgmap;
 	struct dev_pagemap *pgmap;
 	void *addr;
 	int error;
@@ -189,27 +157,15 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 		return error;
 	}
-	p2p_pgmap = devm_kzalloc(&pdev->dev, sizeof(*p2p_pgmap), GFP_KERNEL);
-	if (!p2p_pgmap)
+	pgmap = devm_kzalloc(&pdev->dev, sizeof(*pgmap), GFP_KERNEL);
+	if (!pgmap)
 		return -ENOMEM;
-	init_completion(&p2p_pgmap->ref_done);
-	error = percpu_ref_init(&p2p_pgmap->ref,
-			pci_p2pdma_percpu_release, 0, GFP_KERNEL);
-	if (error)
-		goto pgmap_free;
-	pgmap = &p2p_pgmap->pgmap;
 	pgmap->res.start = pci_resource_start(pdev, bar) + offset;
 	pgmap->res.end = pgmap->res.start + size - 1;
 	pgmap->res.flags = pci_resource_flags(pdev, bar);
-	pgmap->ref = &p2p_pgmap->ref;
 	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
 	pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
 		pci_resource_start(pdev, bar);
-	pgmap->kill = pci_p2pdma_percpu_kill;
-	pgmap->cleanup = pci_p2pdma_percpu_cleanup;
 	addr = devm_memremap_pages(&pdev->dev, pgmap);
 	if (IS_ERR(addr)) {
@@ -220,7 +176,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 	error = gen_pool_add_owner(pdev->p2pdma->pool, (unsigned long)addr,
 			pci_bus_address(pdev, bar) + offset,
 			resource_size(&pgmap->res), dev_to_node(&pdev->dev),
-			&p2p_pgmap->ref);
+			pgmap->ref);
 	if (error)
 		goto pages_free;
@@ -232,7 +188,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
 pages_free:
 	devm_memunmap_pages(&pdev->dev, pgmap);
 pgmap_free:
-	devm_kfree(&pdev->dev, p2p_pgmap);
+	devm_kfree(&pdev->dev, pgmap);
 	return error;
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_add_resource);
...
@@ -1322,7 +1322,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		if (pm->show_pfn)
 			frame = pte_pfn(pte);
 		flags |= PM_PRESENT;
-		page = _vm_normal_page(vma, addr, pte, true);
+		page = vm_normal_page(vma, addr, pte);
 		if (pte_soft_dirty(pte))
 			flags |= PM_SOFT_DIRTY;
 	} else if (is_swap_pte(pte)) {
...
@@ -133,8 +133,7 @@ enum {
 	IORES_DESC_PERSISTENT_MEMORY = 4,
 	IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
 	IORES_DESC_DEVICE_PRIVATE_MEMORY = 6,
-	IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
-	IORES_DESC_RESERVED = 8,
+	IORES_DESC_RESERVED = 7,
 };
 /*
@@ -296,6 +295,8 @@ static inline bool resource_overlaps(struct resource *r1, struct resource *r2)
 	return (r1->start <= r2->end && r1->end >= r2->start);
 }
+struct resource *devm_request_free_mem_region(struct device *dev,
+		struct resource *base, unsigned long size);
 #endif /* __ASSEMBLY__ */
 #endif /* _LINUX_IOPORT_H */
@@ -37,13 +37,6 @@ struct vmem_altmap {
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.rst.
  *
- * MEMORY_DEVICE_PUBLIC:
- * Device memory that is cache coherent from device and CPU point of view. This
- * is use on platform that have an advance system bus (like CAPI or CCIX). A
- * driver can hotplug the device memory using ZONE_DEVICE and with that memory
- * type. Any page of a process can be migrated to such memory. However no one
- * should be allow to pin such memory so that it can always be evicted.
- *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
  * coherent and supports page pinning. In support of coordinating page
@@ -52,54 +45,84 @@ struct vmem_altmap {
  * wakeup is used to coordinate physical address space management (ex:
  * fs truncate/hole punch) vs pinned pages (ex: device dma).
  *
+ * MEMORY_DEVICE_DEVDAX:
+ * Host memory that has similar access semantics as System RAM i.e. DMA
+ * coherent and supports page pinning. In contrast to
+ * MEMORY_DEVICE_FS_DAX, this memory is access via a device-dax
+ * character device.
+ *
  * MEMORY_DEVICE_PCI_P2PDMA:
  * Device memory residing in a PCI BAR intended for use with Peer-to-Peer
 * transactions.
 */
 enum memory_type {
+	/* 0 is reserved to catch uninitialized type fields */
 	MEMORY_DEVICE_PRIVATE = 1,
-	MEMORY_DEVICE_PUBLIC,
 	MEMORY_DEVICE_FS_DAX,
+	MEMORY_DEVICE_DEVDAX,
 	MEMORY_DEVICE_PCI_P2PDMA,
 };
-/*
- * Additional notes about MEMORY_DEVICE_PRIVATE may be found in
- * include/linux/hmm.h and Documentation/vm/hmm.rst. There is also a brief
- * explanation in include/linux/memory_hotplug.h.
- *
- * The page_free() callback is called once the page refcount reaches 1
- * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
- * This allows the device driver to implement its own memory management.)
- */
-typedef void (*dev_page_free_t)(struct page *page, void *data);
+struct dev_pagemap_ops {
+	/*
+	 * Called once the page refcount reaches 1. (ZONE_DEVICE pages never
+	 * reach 0 refcount unless there is a refcount bug. This allows the
+	 * device driver to implement its own memory management.)
+	 */
+	void (*page_free)(struct page *page);
+	/*
+	 * Transition the refcount in struct dev_pagemap to the dead state.
+	 */
+	void (*kill)(struct dev_pagemap *pgmap);
+	/*
+	 * Wait for refcount in struct dev_pagemap to be idle and reap it.
+	 */
+	void (*cleanup)(struct dev_pagemap *pgmap);
+	/*
+	 * Used for private (un-addressable) device memory only. Must migrate
+	 * the page back to a CPU accessible page.
+	 */
+	vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);
+};
+#define PGMAP_ALTMAP_VALID	(1 << 0)
 /**
  * struct dev_pagemap - metadata for ZONE_DEVICE mappings
- * @page_free: free page callback when page refcount reaches 1
  * @altmap: pre-allocated/reserved memory for vmemmap allocations
  * @res: physical address range covered by @ref
  * @ref: reference count that pins the devm_memremap_pages() mapping
- * @kill: callback to transition @ref to the dead state
- * @cleanup: callback to wait for @ref to be idle and reap it
+ * @internal_ref: internal reference if @ref is not provided by the caller
+ * @done: completion for @internal_ref
  * @dev: host device of the mapping for debug
- * @data: private data pointer for page_free()
  * @type: memory type: see MEMORY_* in memory_hotplug.h
+ * @flags: PGMAP_* flags to specify defailed behavior
+ * @ops: method table
  */
 struct dev_pagemap {
-	dev_page_free_t page_free;
 	struct vmem_altmap altmap;
-	bool altmap_valid;
 	struct resource res;
 	struct percpu_ref *ref;
-	void (*kill)(struct percpu_ref *ref);
-	void (*cleanup)(struct percpu_ref *ref);
+	struct percpu_ref internal_ref;
+	struct completion done;
 	struct device *dev;
-	void *data;
 	enum memory_type type;
+	unsigned int flags;
 	u64 pci_p2pdma_bus_offset;
+	const struct dev_pagemap_ops *ops;
 };
+static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
+{
+	if (pgmap->flags & PGMAP_ALTMAP_VALID)
+		return &pgmap->altmap;
+	return NULL;
+}
 #ifdef CONFIG_ZONE_DEVICE
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
 void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
...
@@ -937,8 +937,6 @@ static inline bool is_zone_device_page(const struct page *page)
 #endif
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-void dev_pagemap_get_ops(void);
-void dev_pagemap_put_ops(void);
 void __put_devmap_managed_page(struct page *page);
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
 static inline bool put_devmap_managed_page(struct page *page)
@@ -949,7 +947,6 @@ static inline bool put_devmap_managed_page(struct page *page)
 		return false;
 	switch (page->pgmap->type) {
 	case MEMORY_DEVICE_PRIVATE:
-	case MEMORY_DEVICE_PUBLIC:
 	case MEMORY_DEVICE_FS_DAX:
 		__put_devmap_managed_page(page);
 		return true;
@@ -965,12 +962,6 @@ static inline bool is_device_private_page(const struct page *page)
 		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
-static inline bool is_device_public_page(const struct page *page)
-{
-	return is_zone_device_page(page) &&
-		page->pgmap->type == MEMORY_DEVICE_PUBLIC;
-}
 #ifdef CONFIG_PCI_P2PDMA
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
@@ -985,14 +976,6 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
 #endif /* CONFIG_PCI_P2PDMA */
 #else /* CONFIG_DEV_PAGEMAP_OPS */
-static inline void dev_pagemap_get_ops(void)
-{
-}
-static inline void dev_pagemap_put_ops(void)
-{
-}
 static inline bool put_devmap_managed_page(struct page *page)
 {
 	return false;
@@ -1003,11 +986,6 @@ static inline bool is_device_private_page(const struct page *page)
 	return false;
 }
-static inline bool is_device_public_page(const struct page *page)
-{
-	return false;
-}
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
 	return false;
@@ -1436,10 +1414,8 @@ struct zap_details {
 	pgoff_t last_index; /* Highest page->index to unmap */
 };
-struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-			pte_t pte, bool with_public_device);
-#define vm_normal_page(vma, addr, pte) _vm_normal_page(vma, addr, pte, false)
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+			pte_t pte);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
 			pmd_t pmd);
...
@@ -158,7 +158,7 @@ struct page {
 		struct {	/* ZONE_DEVICE pages */
 			/** @pgmap: Points to the hosting device page map. */
 			struct dev_pagemap *pgmap;
-			unsigned long hmm_data;
+			void *zone_device_data;
 			unsigned long _zd_pad_1;	/* uses mapping */
 		};
@@ -503,7 +503,7 @@ struct mm_struct {
 #endif
 		struct work_struct async_put_work;
-#if IS_ENABLED(CONFIG_HMM)
+#ifdef CONFIG_HMM_MIRROR
 		/* HMM needs to track a few things per mm */
 		struct hmm *hmm;
 #endif
...
@@ -129,12 +129,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
 {
 	return pfn_to_page(swp_offset(entry));
 }
-vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
-		       unsigned long addr,
-		       swp_entry_t entry,
-		       unsigned int flags,
-		       pmd_t *pmdp);
 #else /* CONFIG_DEVICE_PRIVATE */
 static inline swp_entry_t make_device_private_entry(struct page *page, bool write)
 {
@@ -164,15 +158,6 @@ static inline struct page *device_private_entry_to_page(swp_entry_t entry)
 {
 	return NULL;
 }
-static inline vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
-		       unsigned long addr,
-		       swp_entry_t entry,
-		       unsigned int flags,
-		       pmd_t *pmdp)
-{
-	return VM_FAULT_SIGBUS;
-}
 #endif /* CONFIG_DEVICE_PRIVATE */
 #ifdef CONFIG_MIGRATION
...
@@ -677,7 +677,6 @@ void __mmdrop(struct mm_struct *mm)
 	WARN_ON_ONCE(mm == current->active_mm);
 	mm_free_pgd(mm);
 	destroy_context(mm);
-	hmm_mm_destroy(mm);
 	mmu_notifier_mm_destroy(mm);
 	check_mm(mm);
 	put_user_ns(mm->user_ns);
...
@@ -11,41 +11,39 @@
 #include <linux/types.h>
 #include <linux/wait_bit.h>
 #include <linux/xarray.h>
-#include <linux/hmm.h>
 static DEFINE_XARRAY(pgmap_array);
 #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
 #define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
-vm_fault_t device_private_entry_fault(struct vm_area_struct *vma,
-		       unsigned long addr,
-		       swp_entry_t entry,
-		       unsigned int flags,
-		       pmd_t *pmdp)
-{
-	struct page *page = device_private_entry_to_page(entry);
-	struct hmm_devmem *devmem;
-	devmem = container_of(page->pgmap, typeof(*devmem), pagemap);
-	/*
-	 * The page_fault() callback must migrate page back to system memory
-	 * so that CPU can access it. This might fail for various reasons
-	 * (device issue, device was unsafely unplugged, ...). When such
-	 * error conditions happen, the callback must return VM_FAULT_SIGBUS.
-	 *
-	 * Note that because memory cgroup charges are accounted to the device
-	 * memory, this should never fail because of memory restrictions (but
-	 * allocation of regular system page might still fail because we are
-	 * out of memory).
-	 *
-	 * There is a more in-depth description of what that callback can and
-	 * cannot do, in include/linux/memremap.h
-	 */
-	return devmem->page_fault(vma, addr, page, flags, pmdp);
-}
-#endif /* CONFIG_DEVICE_PRIVATE */
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
+EXPORT_SYMBOL(devmap_managed_key);
+static atomic_t devmap_managed_enable;
+static void devmap_managed_enable_put(void *data)
+{
+	if (atomic_dec_and_test(&devmap_managed_enable))
+		static_branch_disable(&devmap_managed_key);
+}
+static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
+{
+	if (!pgmap->ops || !pgmap->ops->page_free) {
+		WARN(1, "Missing page_free method\n");
+		return -EINVAL;
+	}
+	if (atomic_inc_return(&devmap_managed_enable) == 1)
+		static_branch_enable(&devmap_managed_key);
+	return devm_add_action_or_reset(dev, devmap_managed_enable_put, NULL);
+}
+#else
+static int devmap_managed_enable_get(struct device *dev, struct dev_pagemap *pgmap)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
 static void pgmap_array_delete(struct resource *res)
 {
@@ -56,14 +54,8 @@ static void pgmap_array_delete(struct resource *res)
 static unsigned long pfn_first(struct dev_pagemap *pgmap)
 {
-	const struct resource *res = &pgmap->res;
-	struct vmem_altmap *altmap = &pgmap->altmap;
-	unsigned long pfn;
-	pfn = res->start >> PAGE_SHIFT;
-	if (pgmap->altmap_valid)
-		pfn += vmem_altmap_offset(altmap);
-	return pfn;
+	return (pgmap->res.start >> PAGE_SHIFT) +
+		vmem_altmap_offset(pgmap_altmap(pgmap));
 }
 static unsigned long pfn_end(struct dev_pagemap *pgmap)
@@ -83,6 +75,24 @@ static unsigned long pfn_next(unsigned long pfn)
 #define for_each_device_pfn(pfn, map) \
 	for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
+static void dev_pagemap_kill(struct dev_pagemap *pgmap)
+{
+	if (pgmap->ops && pgmap->ops->kill)
+		pgmap->ops->kill(pgmap);
+	else
+		percpu_ref_kill(pgmap->ref);
+}
+static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
+{
+	if (pgmap->ops && pgmap->ops->cleanup) {
+		pgmap->ops->cleanup(pgmap);
+	} else {
+		wait_for_completion(&pgmap->done);
+		percpu_ref_exit(pgmap->ref);
+	}
+}
 static void devm_memremap_pages_release(void *data)
 {
 	struct dev_pagemap *pgmap = data;
@@ -92,10 +102,10 @@ static void devm_memremap_pages_release(void *data)
 	unsigned long pfn;
 	int nid;
-	pgmap->kill(pgmap->ref);
+	dev_pagemap_kill(pgmap);
 	for_each_device_pfn(pfn, pgmap)
 		put_page(pfn_to_page(pfn));
-	pgmap->cleanup(pgmap->ref);
+	dev_pagemap_cleanup(pgmap);
 	/* pages are dead and unused, undo the arch mapping */
 	align_start = res->start & ~(SECTION_SIZE - 1);
@@ -111,7 +121,7 @@ static void devm_memremap_pages_release(void *data)
 				align_size >> PAGE_SHIFT, NULL);
 	} else {
 		arch_remove_memory(nid, align_start, align_size,
-				pgmap->altmap_valid ? &pgmap->altmap : NULL);
+				pgmap_altmap(pgmap));
 		kasan_remove_zero_shadow(__va(align_start), align_size);
 	}
 	mem_hotplug_done();
@@ -122,20 +132,29 @@ static void devm_memremap_pages_release(void *data)
 		"%s: failed to free all reserved pages\n", __func__);
 }
+static void dev_pagemap_percpu_release(struct percpu_ref *ref)
+{
+	struct dev_pagemap *pgmap =
+		container_of(ref, struct dev_pagemap, internal_ref);
+	complete(&pgmap->done);
+}
 /**
  * devm_memremap_pages - remap and provide memmap backing for the given resource
  * @dev: hosting device for @res
  * @pgmap: pointer to a struct dev_pagemap
  *
  * Notes:
- * 1/ At a minimum the res, ref and type members of @pgmap must be initialized
+ * 1/ At a minimum the res and type members of @pgmap must be initialized
  *    by the caller before passing it to this function
  *
- * 2/ The altmap field may optionally be initialized, in which case altmap_valid
- *    must be set to true
+ * 2/ The altmap field may optionally be initialized, in which case
+ *    PGMAP_ALTMAP_VALID must be set in pgmap->flags.
 *
- * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped
- *    at devm_memremap_pages_release() time, or if this routine fails.
+ * 3/ The ref field may optionally be provided, in which pgmap->ref must be
+ *    'live' on entry and will be killed and reaped at
+ *    devm_memremap_pages_release() time, or if this routine fails.
 *
 * 4/ res is expected to be a host memory range that could feasibly be
 *    treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -144,22 +163,66 @@ static void devm_memremap_pages_release(void *data)
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 {
 	resource_size_t align_start, align_size, align_end;
-	struct vmem_altmap *altmap = pgmap->altmap_valid ?
-			&pgmap->altmap : NULL;
 	struct resource *res = &pgmap->res;
 	struct dev_pagemap *conflict_pgmap;
 	struct mhp_restrictions restrictions = {
 		/*
		 * We do not want any optional features only our own memmap
		 */
-		.altmap = altmap,
+		.altmap = pgmap_altmap(pgmap),
 	};
 	pgprot_t pgprot = PAGE_KERNEL;
 	int error, nid, is_ram;
+	bool need_devmap_managed = true;
+	switch (pgmap->type) {
+	case MEMORY_DEVICE_PRIVATE:
+		if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
+			WARN(1, "Device private memory not supported\n");
+			return ERR_PTR(-EINVAL);
+		}
+		if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
+			WARN(1, "Missing migrate_to_ram method\n");
+			return ERR_PTR(-EINVAL);
+		}
+		break;
+	case MEMORY_DEVICE_FS_DAX:
+		if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
+		    IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
+			WARN(1, "File system DAX not supported\n");
+			return ERR_PTR(-EINVAL);
+		}
+		break;
+	case MEMORY_DEVICE_DEVDAX:
+	case MEMORY_DEVICE_PCI_P2PDMA:
+		need_devmap_managed = false;
+		break;
+	default:
+		WARN(1, "Invalid pgmap type %d\n", pgmap->type);
+		break;
+	}
+	if (!pgmap->ref) {
+		if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
+			return ERR_PTR(-EINVAL);
+		init_completion(&pgmap->done);
+		error = percpu_ref_init(&pgmap->internal_ref,
+				dev_pagemap_percpu_release, 0, GFP_KERNEL);
+		if (error)
+			return ERR_PTR(error);
+		pgmap->ref = &pgmap->internal_ref;
+	} else {
+		if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
+			WARN(1, "Missing reference count teardown definition\n");
+			return ERR_PTR(-EINVAL);
+		}
+	}
-	if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) {
-		WARN(1, "Missing reference count teardown definition\n");
-		return ERR_PTR(-EINVAL);
+	if (need_devmap_managed) {
+		error = devmap_managed_enable_get(dev, pgmap);
+		if (error)
+			return ERR_PTR(error);
 	}
 	align_start = res->start & ~(SECTION_SIZE - 1);
@@ -241,7 +304,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 		zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
 		move_pfn_range_to_zone(zone, align_start >> PAGE_SHIFT,
-				align_size >> PAGE_SHIFT, altmap);
+				align_size >> PAGE_SHIFT, pgmap_altmap(pgmap));
 	}
 	mem_hotplug_done();
@@ -271,9 +334,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 err_pfn_remap:
 	pgmap_array_delete(res);
 err_array:
-	pgmap->kill(pgmap->ref);
-	pgmap->cleanup(pgmap->ref);
+	dev_pagemap_kill(pgmap);
+	dev_pagemap_cleanup(pgmap);
 	return ERR_PTR(error);
 }
 EXPORT_SYMBOL_GPL(devm_memremap_pages);
@@ -287,7 +349,9 @@ EXPORT_SYMBOL_GPL(devm_memunmap_pages);
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
 {
 	/* number of pfns from base where pfn_to_page() is valid */
-	return altmap->reserve + altmap->free;
+	if (altmap)
+		return altmap->reserve + altmap->free;
+	return 0;
 }
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
@@ -329,28 +393,6 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
 EXPORT_SYMBOL_GPL(get_dev_pagemap);
 #ifdef CONFIG_DEV_PAGEMAP_OPS
-DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
-EXPORT_SYMBOL(devmap_managed_key);
-static atomic_t devmap_enable;
-/*
- * Toggle the static key for ->page_free() callbacks when dev_pagemap
- * pages go idle.
- */
-void dev_pagemap_get_ops(void)
-{
-	if (atomic_inc_return(&devmap_enable) == 1)
-		static_branch_enable(&devmap_managed_key);
-}
-EXPORT_SYMBOL_GPL(dev_pagemap_get_ops);
-void dev_pagemap_put_ops(void)
-{
-	if (atomic_dec_and_test(&devmap_enable))
-		static_branch_disable(&devmap_managed_key);
-}
-EXPORT_SYMBOL_GPL(dev_pagemap_put_ops);
 void __put_devmap_managed_page(struct page *page)
 {
 	int count = page_ref_dec_return(page);
@@ -366,7 +408,7 @@ void __put_devmap_managed_page(struct page *page)
 		mem_cgroup_uncharge(page);
-		page->pgmap->page_free(page, page->pgmap->data);
+		page->pgmap->ops->page_free(page);
 	} else if (!count)
 		__put_page(page);
 }
...
@@ -1628,6 +1628,45 @@ void resource_list_free(struct list_head *head)
 }
 EXPORT_SYMBOL(resource_list_free);
+#ifdef CONFIG_DEVICE_PRIVATE
+/**
+ * devm_request_free_mem_region - find free region for device private memory
+ *
+ * @dev: device struct to bind the resource to
+ * @size: size in bytes of the device memory to add
+ * @base: resource tree to look in
+ *
+ * This function tries to find an empty range of physical address big enough to
+ * contain the new resource, so that it can later be hotplugged as ZONE_DEVICE
+ * memory, which in turn allocates struct pages.
+ */
+struct resource *devm_request_free_mem_region(struct device *dev,
+		struct resource *base, unsigned long size)
+{
+	resource_size_t end, addr;
+	struct resource *res;
+	size = ALIGN(size, 1UL << PA_SECTION_SHIFT);
+	end = min_t(unsigned long, base->end, (1UL << MAX_PHYSMEM_BITS) - 1);
+	addr = end - size + 1UL;
+	for (; addr > size && addr >= base->start; addr -= size) {
+		if (region_intersects(addr, size, 0, IORES_DESC_NONE) !=
+				REGION_DISJOINT)
+			continue;
+		res = devm_request_mem_region(dev, addr, size, dev_name(dev));
+		if (!res)
+			return ERR_PTR(-ENOMEM);
+		res->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
+		return res;
+	}
+	return ERR_PTR(-ERANGE);
+}
+EXPORT_SYMBOL_GPL(devm_request_free_mem_region);
+#endif /* CONFIG_DEVICE_PRIVATE */
 static int __init strict_iomem(char *str)
 {
 	if (strstr(str, "relaxed"))
...
@@ -670,47 +670,17 @@ config ZONE_DEVICE
 	  If FS_DAX is enabled, then say Y.
-config ARCH_HAS_HMM_MIRROR
-	bool
-	default y
-	depends on (X86_64 || PPC64)
-	depends on MMU && 64BIT
-config ARCH_HAS_HMM_DEVICE
-	bool
-	default y
-	depends on (X86_64 || PPC64)
-	depends on MEMORY_HOTPLUG
-	depends on MEMORY_HOTREMOVE
-	depends on SPARSEMEM_VMEMMAP
-	depends on ARCH_HAS_ZONE_DEVICE
-	select XARRAY_MULTI
-config ARCH_HAS_HMM
-	bool
-	default y
-	depends on (X86_64 || PPC64)
-	depends on ZONE_DEVICE
-	depends on MMU && 64BIT
-	depends on MEMORY_HOTPLUG
-	depends on MEMORY_HOTREMOVE
-	depends on SPARSEMEM_VMEMMAP
 config MIGRATE_VMA_HELPER
 	bool
 config DEV_PAGEMAP_OPS
 	bool
-config HMM
-	bool
-	select MMU_NOTIFIER
-	select MIGRATE_VMA_HELPER
 config HMM_MIRROR
 	bool "HMM mirror CPU page table into a device page table"
-	depends on ARCH_HAS_HMM
-	select HMM
+	depends on (X86_64 || PPC64)
+	depends on MMU && 64BIT
+	select MMU_NOTIFIER
 	help
 	  Select HMM_MIRROR if you want to mirror range of the CPU page table of a
 	  process into a device page table. Here, mirror means "keep synchronized".
@@ -720,8 +690,7 @@ config HMM_MIRROR
 config DEVICE_PRIVATE
 	bool "Unaddressable device memory (GPU memory, ...)"
-	depends on ARCH_HAS_HMM
-	select HMM
+	depends on ZONE_DEVICE
 	select DEV_PAGEMAP_OPS
 	help
@@ -729,17 +698,6 @@ config DEVICE_PRIVATE
 	  memory; i.e., memory that is only accessible from the device (or
 	  group of devices). You likely also want to select HMM_MIRROR.
-config DEVICE_PUBLIC
-	bool "Addressable device memory (like GPU memory)"
-	depends on ARCH_HAS_HMM
-	select HMM
-	select DEV_PAGEMAP_OPS
-	help
-	  Allows creation of struct pages to represent addressable device
-	  memory; i.e., memory that is accessible from both the device and
-	  the CPU
 config FRAME_VECTOR
 	bool
...
...@@ -102,5 +102,5 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o ...@@ -102,5 +102,5 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
obj-$(CONFIG_HMM) += hmm.o obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o
...@@ -609,13 +609,6 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, ...@@ -609,13 +609,6 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
goto unmap; goto unmap;
*page = pte_page(*pte); *page = pte_page(*pte);
/*
* This should never happen (a device public page in the gate
* area).
*/
if (is_device_public_page(*page))
goto unmap;
} }
if (unlikely(!try_get_page(*page))) { if (unlikely(!try_get_page(*page))) {
ret = -ENOMEM; ret = -ENOMEM;
......
This diff is collapsed.
...@@ -354,7 +354,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, ...@@ -354,7 +354,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
continue; continue;
} }
page = _vm_normal_page(vma, addr, ptent, true); page = vm_normal_page(vma, addr, ptent);
if (!page) if (!page)
continue; continue;
......
...@@ -4908,7 +4908,7 @@ enum mc_target_type { ...@@ -4908,7 +4908,7 @@ enum mc_target_type {
static struct page *mc_handle_present_pte(struct vm_area_struct *vma, static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent) unsigned long addr, pte_t ptent)
{ {
struct page *page = _vm_normal_page(vma, addr, ptent, true); struct page *page = vm_normal_page(vma, addr, ptent);
if (!page || !page_mapped(page)) if (!page || !page_mapped(page))
return NULL; return NULL;
...@@ -5109,8 +5109,8 @@ static int mem_cgroup_move_account(struct page *page, ...@@ -5109,8 +5109,8 @@ static int mem_cgroup_move_account(struct page *page,
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored * target for charge migration. if @target is not NULL, the entry is stored
* in target->ent. * in target->ent.
* 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
* or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). * (so ZONE_DEVICE page and thus not on the lru).
* For now such a page is charged like a regular page would be, as for all * For now such a page is charged like a regular page would be, as for all
* intents and purposes it is just special memory taking the place of a * intents and purposes it is just special memory taking the place of a
* regular page. * regular page.
...@@ -5144,8 +5144,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, ...@@ -5144,8 +5144,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/ */
if (page->mem_cgroup == mc.from) { if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE; ret = MC_TARGET_PAGE;
if (is_device_private_page(page) || if (is_device_private_page(page))
is_device_public_page(page))
ret = MC_TARGET_DEVICE; ret = MC_TARGET_DEVICE;
if (target) if (target)
target->page = page; target->page = page;
...@@ -5216,8 +5215,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, ...@@ -5216,8 +5215,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
if (ptl) { if (ptl) {
/* /*
* Note there cannot be MC_TARGET_DEVICE for now as we do not * Note there cannot be MC_TARGET_DEVICE for now as we do not
* support transparent huge page with MEMORY_DEVICE_PUBLIC or * support transparent huge page with MEMORY_DEVICE_PRIVATE but
* MEMORY_DEVICE_PRIVATE but this might change. * this might change.
*/ */
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR; mc.precharge += HPAGE_PMD_NR;
......
...@@ -1177,16 +1177,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, ...@@ -1177,16 +1177,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
goto unlock; goto unlock;
} }
switch (pgmap->type) { if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
case MEMORY_DEVICE_PRIVATE:
case MEMORY_DEVICE_PUBLIC:
/* /*
* TODO: Handle HMM pages which may need coordination * TODO: Handle HMM pages which may need coordination
* with device-side memory. * with device-side memory.
*/ */
goto unlock; goto unlock;
default:
break;
} }
/* /*
......
...@@ -571,8 +571,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, ...@@ -571,8 +571,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
* PFNMAP mappings in order to support COWable mappings. * PFNMAP mappings in order to support COWable mappings.
* *
*/ */
struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte, bool with_public_device) pte_t pte)
{ {
unsigned long pfn = pte_pfn(pte); unsigned long pfn = pte_pfn(pte);
...@@ -585,29 +585,6 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, ...@@ -585,29 +585,6 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
return NULL; return NULL;
if (is_zero_pfn(pfn)) if (is_zero_pfn(pfn))
return NULL; return NULL;
/*
* Device public pages are special pages (they are ZONE_DEVICE
* pages but different from persistent memory). They behave
* allmost like normal pages. The difference is that they are
* not on the lru and thus should never be involve with any-
* thing that involve lru manipulation (mlock, numa balancing,
* ...).
*
* This is why we still want to return NULL for such page from
* vm_normal_page() so that we do not have to special case all
* call site of vm_normal_page().
*/
if (likely(pfn <= highest_memmap_pfn)) {
struct page *page = pfn_to_page(pfn);
if (is_device_public_page(page)) {
if (with_public_device)
return page;
return NULL;
}
}
if (pte_devmap(pte)) if (pte_devmap(pte))
return NULL; return NULL;
...@@ -797,17 +774,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, ...@@ -797,17 +774,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
rss[mm_counter(page)]++; rss[mm_counter(page)]++;
} else if (pte_devmap(pte)) { } else if (pte_devmap(pte)) {
page = pte_page(pte); page = pte_page(pte);
/*
* Cache coherent device memory behave like regular page and
* not like persistent memory page. For more informations see
* MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
*/
if (is_device_public_page(page)) {
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
}
} }
out_set_pte: out_set_pte:
...@@ -1063,7 +1029,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, ...@@ -1063,7 +1029,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
if (pte_present(ptent)) { if (pte_present(ptent)) {
struct page *page; struct page *page;
page = _vm_normal_page(vma, addr, ptent, true); page = vm_normal_page(vma, addr, ptent);
if (unlikely(details) && page) { if (unlikely(details) && page) {
/* /*
* unmap_shared_mapping_pages() wants to * unmap_shared_mapping_pages() wants to
...@@ -2777,13 +2743,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ...@@ -2777,13 +2743,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
migration_entry_wait(vma->vm_mm, vmf->pmd, migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address); vmf->address);
} else if (is_device_private_entry(entry)) { } else if (is_device_private_entry(entry)) {
/* vmf->page = device_private_entry_to_page(entry);
* For un-addressable device memory we call the pgmap ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
* fault handler callback. The callback must migrate
* the page back to some CPU accessible page.
*/
ret = device_private_entry_fault(vma, vmf->address, entry,
vmf->flags, vmf->pmd);
} else if (is_hwpoison_entry(entry)) { } else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON; ret = VM_FAULT_HWPOISON;
} else { } else {
......
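For reference, here is a skeletal, driver-side migrate_to_ram() handler matching the call the fault path now makes via vmf->page->pgmap->ops->migrate_to_ram(vmf). It is only a placeholder: a real handler would use the migrate_vma machinery to copy the data back to system RAM.

static vm_fault_t my_migrate_to_ram(struct vm_fault *vmf)
{
	struct page *page = vmf->page;	/* the faulting device private page */

	/*
	 * A real handler would:
	 *   1. look up its per-page state in page->zone_device_data,
	 *   2. set up a migration covering vmf->address,
	 *   3. allocate a system RAM page and copy the device data back,
	 *   4. finalize the migration so the CPU PTE points at the new page.
	 */
	if (!page->zone_device_data)
		return VM_FAULT_SIGBUS;	/* nothing to migrate back */

	return 0;
}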
...@@ -557,10 +557,8 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, ...@@ -557,10 +557,8 @@ void __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
int sections_to_remove; int sections_to_remove;
/* In the ZONE_DEVICE case device driver owns the memory region */ /* In the ZONE_DEVICE case device driver owns the memory region */
if (is_dev_zone(zone)) { if (is_dev_zone(zone))
if (altmap) map_offset = vmem_altmap_offset(altmap);
map_offset = vmem_altmap_offset(altmap);
}
clear_zone_contiguous(zone); clear_zone_contiguous(zone);
......
...@@ -2098,6 +2098,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, ...@@ -2098,6 +2098,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
out: out:
return page; return page;
} }
EXPORT_SYMBOL(alloc_pages_vma);
/** /**
* alloc_pages_current - Allocate pages. * alloc_pages_current - Allocate pages.
......
...@@ -246,8 +246,6 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, ...@@ -246,8 +246,6 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
if (is_device_private_page(new)) { if (is_device_private_page(new)) {
entry = make_device_private_entry(new, pte_write(pte)); entry = make_device_private_entry(new, pte_write(pte));
pte = swp_entry_to_pte(entry); pte = swp_entry_to_pte(entry);
} else if (is_device_public_page(new)) {
pte = pte_mkdevmap(pte);
} }
} }
...@@ -381,7 +379,6 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) ...@@ -381,7 +379,6 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
* ZONE_DEVICE pages. * ZONE_DEVICE pages.
*/ */
expected_count += is_device_private_page(page); expected_count += is_device_private_page(page);
expected_count += is_device_public_page(page);
if (mapping) if (mapping)
expected_count += hpage_nr_pages(page) + page_has_private(page); expected_count += hpage_nr_pages(page) + page_has_private(page);
...@@ -994,10 +991,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, ...@@ -994,10 +991,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
if (!PageMappingFlags(page)) if (!PageMappingFlags(page))
page->mapping = NULL; page->mapping = NULL;
if (unlikely(is_zone_device_page(newpage))) { if (likely(!is_zone_device_page(newpage)))
if (is_device_public_page(newpage))
flush_dcache_page(newpage);
} else
flush_dcache_page(newpage); flush_dcache_page(newpage);
} }
...@@ -2265,7 +2259,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, ...@@ -2265,7 +2259,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
pfn = 0; pfn = 0;
goto next; goto next;
} }
page = _vm_normal_page(migrate->vma, addr, pte, true); page = vm_normal_page(migrate->vma, addr, pte);
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
} }
...@@ -2406,16 +2400,7 @@ static bool migrate_vma_check_page(struct page *page) ...@@ -2406,16 +2400,7 @@ static bool migrate_vma_check_page(struct page *page)
* FIXME proper solution is to rework migration_entry_wait() so * FIXME proper solution is to rework migration_entry_wait() so
* it does not need to take a reference on page. * it does not need to take a reference on page.
*/ */
if (is_device_private_page(page)) return is_device_private_page(page);
return true;
/*
* Only allow device public page to be migrated and account for
* the extra reference count imply by ZONE_DEVICE pages.
*/
if (!is_device_public_page(page))
return false;
extra++;
} }
/* For file back page */ /* For file back page */
...@@ -2665,11 +2650,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, ...@@ -2665,11 +2650,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
entry = swp_entry_to_pte(swp_entry); entry = swp_entry_to_pte(swp_entry);
} else if (is_device_public_page(page)) {
entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
entry = pte_mkdevmap(entry);
} }
} else { } else {
entry = mk_pte(page, vma->vm_page_prot); entry = mk_pte(page, vma->vm_page_prot);
...@@ -2789,7 +2769,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) ...@@ -2789,7 +2769,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate)
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
continue; continue;
} }
} else if (!is_device_public_page(newpage)) { } else {
/* /*
* Other types of ZONE_DEVICE page are not * Other types of ZONE_DEVICE page are not
* supported. * supported.
......
...@@ -5925,6 +5925,7 @@ void __ref memmap_init_zone_device(struct zone *zone, ...@@ -5925,6 +5925,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
{ {
unsigned long pfn, end_pfn = start_pfn + size; unsigned long pfn, end_pfn = start_pfn + size;
struct pglist_data *pgdat = zone->zone_pgdat; struct pglist_data *pgdat = zone->zone_pgdat;
struct vmem_altmap *altmap = pgmap_altmap(pgmap);
unsigned long zone_idx = zone_idx(zone); unsigned long zone_idx = zone_idx(zone);
unsigned long start = jiffies; unsigned long start = jiffies;
int nid = pgdat->node_id; int nid = pgdat->node_id;
...@@ -5937,9 +5938,7 @@ void __ref memmap_init_zone_device(struct zone *zone, ...@@ -5937,9 +5938,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
* of the pages reserved for the memmap, so we can just jump to * of the pages reserved for the memmap, so we can just jump to
* the end of that region and start processing the device pages. * the end of that region and start processing the device pages.
*/ */
if (pgmap->altmap_valid) { if (altmap) {
struct vmem_altmap *altmap = &pgmap->altmap;
start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
size = end_pfn - start_pfn; size = end_pfn - start_pfn;
} }
...@@ -5959,12 +5958,12 @@ void __ref memmap_init_zone_device(struct zone *zone, ...@@ -5959,12 +5958,12 @@ void __ref memmap_init_zone_device(struct zone *zone,
__SetPageReserved(page); __SetPageReserved(page);
/* /*
* ZONE_DEVICE pages union ->lru with a ->pgmap back * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
* pointer and hmm_data. It is a bug if a ZONE_DEVICE * and zone_device_data. It is a bug if a ZONE_DEVICE page is
* page is ever freed or placed on a driver-private list. * ever freed or placed on a driver-private list.
*/ */
page->pgmap = pgmap; page->pgmap = pgmap;
page->hmm_data = 0; page->zone_device_data = NULL;
/* /*
* Mark the block movable so that blocks are reserved for * Mark the block movable so that blocks are reserved for
......
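memmap_init_zone_device() above now asks pgmap_altmap() for the altmap instead of testing the removed altmap_valid field. A sketch of what that helper boils down to after this series, based on the new PGMAP_ALTMAP_VALID flag (paraphrased from include/linux/memremap.h, not quoted from the diff):

static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
{
	if (pgmap->flags & PGMAP_ALTMAP_VALID)
		return &pgmap->altmap;	/* altmap is embedded in dev_pagemap */
	return NULL;			/* no altmap: use the regular memmap */
}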
...@@ -740,15 +740,20 @@ void release_pages(struct page **pages, int nr) ...@@ -740,15 +740,20 @@ void release_pages(struct page **pages, int nr)
if (is_huge_zero_page(page)) if (is_huge_zero_page(page))
continue; continue;
/* Device public page can not be huge page */ if (is_zone_device_page(page)) {
if (is_device_public_page(page)) {
if (locked_pgdat) { if (locked_pgdat) {
spin_unlock_irqrestore(&locked_pgdat->lru_lock, spin_unlock_irqrestore(&locked_pgdat->lru_lock,
flags); flags);
locked_pgdat = NULL; locked_pgdat = NULL;
} }
put_devmap_managed_page(page); /*
continue; * ZONE_DEVICE pages that return 'false' from
* put_devmap_managed_page() do not require special
* processing, and instead, expect a call to
* put_page_testzero().
*/
if (put_devmap_managed_page(page))
continue;
} }
page = compound_head(page); page = compound_head(page);
......
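release_pages() now checks the return value of put_devmap_managed_page() for every ZONE_DEVICE page instead of special-casing device public pages. Conceptually the helper behaves like the simplified sketch below (not the exact inline from include/linux/mm.h, which also tests the devmap_managed_key static key first):

static inline bool put_devmap_managed_page(struct page *page)
{
	if (!is_zone_device_page(page))
		return false;

	switch (page->pgmap->type) {
	case MEMORY_DEVICE_PRIVATE:
	case MEMORY_DEVICE_FS_DAX:
		__put_devmap_managed_page(page); /* may invoke ->page_free() */
		return true;
	default:
		return false;	/* caller falls back to put_page_testzero() */
	}
}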
...@@ -100,25 +100,60 @@ static void nfit_test_kill(void *_pgmap) ...@@ -100,25 +100,60 @@ static void nfit_test_kill(void *_pgmap)
{ {
struct dev_pagemap *pgmap = _pgmap; struct dev_pagemap *pgmap = _pgmap;
WARN_ON(!pgmap || !pgmap->ref || !pgmap->kill || !pgmap->cleanup); WARN_ON(!pgmap || !pgmap->ref);
pgmap->kill(pgmap->ref);
pgmap->cleanup(pgmap->ref); if (pgmap->ops && pgmap->ops->kill)
pgmap->ops->kill(pgmap);
else
percpu_ref_kill(pgmap->ref);
if (pgmap->ops && pgmap->ops->cleanup) {
pgmap->ops->cleanup(pgmap);
} else {
wait_for_completion(&pgmap->done);
percpu_ref_exit(pgmap->ref);
}
}
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
{
struct dev_pagemap *pgmap =
container_of(ref, struct dev_pagemap, internal_ref);
complete(&pgmap->done);
} }
void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) void *__wrap_devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
{ {
int error;
resource_size_t offset = pgmap->res.start; resource_size_t offset = pgmap->res.start;
struct nfit_test_resource *nfit_res = get_nfit_res(offset); struct nfit_test_resource *nfit_res = get_nfit_res(offset);
if (nfit_res) { if (!nfit_res)
int rc; return devm_memremap_pages(dev, pgmap);
rc = devm_add_action_or_reset(dev, nfit_test_kill, pgmap); pgmap->dev = dev;
if (rc) if (!pgmap->ref) {
return ERR_PTR(rc); if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
return nfit_res->buf + offset - nfit_res->res.start; return ERR_PTR(-EINVAL);
init_completion(&pgmap->done);
error = percpu_ref_init(&pgmap->internal_ref,
dev_pagemap_percpu_release, 0, GFP_KERNEL);
if (error)
return ERR_PTR(error);
pgmap->ref = &pgmap->internal_ref;
} else {
if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
WARN(1, "Missing reference count teardown definition\n");
return ERR_PTR(-EINVAL);
}
} }
return devm_memremap_pages(dev, pgmap);
error = devm_add_action_or_reset(dev, nfit_test_kill, pgmap);
if (error)
return ERR_PTR(error);
return nfit_res->buf + offset - nfit_res->res.start;
} }
EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages); EXPORT_SYMBOL_GPL(__wrap_devm_memremap_pages);
......
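The wrapper above mirrors the rule devm_memremap_pages() itself now enforces: either supply your own percpu_ref together with both ->kill() and ->cleanup() ops, or leave pgmap->ref NULL and let the core drive the embedded internal_ref and done completion. A minimal, hypothetical caller of the internal-refcount style, assuming pgmap->type, pgmap->res and (for MEMORY_DEVICE_PRIVATE) pgmap->ops are already set up:

static int simple_memremap(struct device *dev, struct dev_pagemap *pgmap)
{
	void *addr;

	/*
	 * No external refcount and no ->kill()/->cleanup() callbacks:
	 * devm_memremap_pages() initializes pgmap->internal_ref and the
	 * devm teardown path waits on pgmap->done before removing the pages.
	 */
	pgmap->ref = NULL;

	addr = devm_memremap_pages(dev, pgmap);
	return PTR_ERR_OR_ZERO(addr);
}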