Commit 7d3bf613 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'libnvdimm-for-4.18' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm updates from Dan Williams:
 "This adds a user for the new 'bytes-remaining' updates to
  memcpy_mcsafe() that you already received through Ingo via the
  x86-dax- for-linus pull.

  Not included here, but still targeting this cycle, is support for
  handling memory media errors (poison) consumed via userspace dax
  mappings.

  Summary:

   - DAX broke a fundamental assumption of truncate of file mapped
     pages. The truncate path assumed that it is safe to disconnect a
     pinned page from a file and let the filesystem reclaim the physical
     block. With DAX the page is equivalent to the filesystem block.
     Introduce dax_layout_busy_page() to enable filesystems to wait for
     pinned DAX pages to be released. Without this wait a filesystem
     could allocate blocks under active device-DMA to a new file.

   - DAX arranges for the block layer to be bypassed and uses
     dax_direct_access() + copy_to_iter() to satisfy read(2) calls.
     However, the memcpy_mcsafe() facility is available through the pmem
     block driver. In order to safely handle media errors, via the DAX
     block-layer bypass, introduce copy_to_iter_mcsafe().

   - Fix cache management policy relative to the ACPI NFIT Platform
     Capabilities Structure to properly elide cache flushes when they
     are not necessary. The table indicates whether CPU caches are
     power-fail protected. Clarify that a deep flush is always performed
     on REQ_{FUA,PREFLUSH} requests"

* tag 'libnvdimm-for-4.18' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (21 commits)
  dax: Use dax_write_cache* helpers
  libnvdimm, pmem: Do not flush power-fail protected CPU caches
  libnvdimm, pmem: Unconditionally deep flush on *sync
  libnvdimm, pmem: Complete REQ_FLUSH => REQ_PREFLUSH
  acpi, nfit: Remove ecc_unit_size
  dax: dax_insert_mapping_entry always succeeds
  libnvdimm, e820: Register all pmem resources
  libnvdimm: Debug probe times
  linvdimm, pmem: Preserve read-only setting for pmem devices
  x86, nfit_test: Add unit test for memcpy_mcsafe()
  pmem: Switch to copy_to_iter_mcsafe()
  dax: Report bytes remaining in dax_iomap_actor()
  dax: Introduce a ->copy_to_iter dax operation
  uio, lib: Fix CONFIG_ARCH_HAS_UACCESS_MCSAFE compilation
  xfs, dax: introduce xfs_break_dax_layouts()
  xfs: prepare xfs_break_layouts() for another layout type
  xfs: prepare xfs_break_layouts() to be called with XFS_MMAPLOCK_EXCL
  mm, fs, dax: handle layout changes to pinned dax mappings
  mm: fix __gup_device_huge vs unmap
  mm: introduce MEMORY_DEVICE_FS_DAX and CONFIG_DEV_PAGEMAP_OPS
  ...
parents a3818841 930218af
What: /sys/bus/nd/devices/regionX/nfit/ecc_unit_size
Date: Aug, 2017
KernelVersion: v4.14 (Removed v4.18)
Contact: linux-nvdimm@lists.01.org
Description:
(RO) Size of a write request to a DIMM that will not incur a
read-modify-write cycle at the memory controller.
When the nfit driver initializes it runs an ARS (Address Range
Scrub) operation across every pmem range. Part of that process
involves determining the ARS capabilities of a given address
range. One of the capabilities that is reported is the 'Clear
Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2
section 9.20.7.4 Function Index 1 - Query ARS Capabilities).
This property indicates the boundary at which the NVDIMM may
need to perform read-modify-write cycles to maintain ECC (Error
Correcting Code) blocks.
......@@ -212,22 +212,3 @@ Description:
range. Used by NVDIMM Region Mapping Structure to uniquely refer
to this structure. Value of 0 is reserved and not used as an
index.
What: /sys/bus/nd/devices/regionX/nfit/ecc_unit_size
Date: Aug, 2017
KernelVersion: v4.14
Contact: linux-nvdimm@lists.01.org
Description:
(RO) Size of a write request to a DIMM that will not incur a
read-modify-write cycle at the memory controller.
When the nfit driver initializes it runs an ARS (Address Range
Scrub) operation across every pmem range. Part of that process
involves determining the ARS capabilities of a given address
range. One of the capabilities that is reported is the 'Clear
Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2
section 9.20.7.4 Function Index 1 - Query ARS Capabilities).
This property indicates the boundary at which the NVDIMM may
need to perform read-modify-write cycles to maintain ECC (Error
Correcting Code) blocks.
......@@ -72,6 +72,9 @@ config EARLY_PRINTK_USB_XDBC
You should normally say N here, unless you want to debug early
crashes or need a very simple printk logging facility.
config MCSAFE_TEST
def_bool n
config X86_PTDUMP_CORE
def_bool n
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MCSAFE_TEST_H_
#define _MCSAFE_TEST_H_
#ifndef __ASSEMBLY__
#ifdef CONFIG_MCSAFE_TEST
extern unsigned long mcsafe_test_src;
extern unsigned long mcsafe_test_dst;
static inline void mcsafe_inject_src(void *addr)
{
if (addr)
mcsafe_test_src = (unsigned long) addr;
else
mcsafe_test_src = ~0UL;
}
static inline void mcsafe_inject_dst(void *addr)
{
if (addr)
mcsafe_test_dst = (unsigned long) addr;
else
mcsafe_test_dst = ~0UL;
}
#else /* CONFIG_MCSAFE_TEST */
static inline void mcsafe_inject_src(void *addr)
{
}
static inline void mcsafe_inject_dst(void *addr)
{
}
#endif /* CONFIG_MCSAFE_TEST */
#else /* __ASSEMBLY__ */
#include <asm/export.h>
#ifdef CONFIG_MCSAFE_TEST
.macro MCSAFE_TEST_CTL
.pushsection .data
.align 8
.globl mcsafe_test_src
mcsafe_test_src:
.quad 0
EXPORT_SYMBOL_GPL(mcsafe_test_src)
.globl mcsafe_test_dst
mcsafe_test_dst:
.quad 0
EXPORT_SYMBOL_GPL(mcsafe_test_dst)
.popsection
.endm
.macro MCSAFE_TEST_SRC reg count target
leaq \count(\reg), %r9
cmp mcsafe_test_src, %r9
ja \target
.endm
.macro MCSAFE_TEST_DST reg count target
leaq \count(\reg), %r9
cmp mcsafe_test_dst, %r9
ja \target
.endm
#else
.macro MCSAFE_TEST_CTL
.endm
.macro MCSAFE_TEST_SRC reg count target
.endm
.macro MCSAFE_TEST_DST reg count target
.endm
#endif /* CONFIG_MCSAFE_TEST */
#endif /* __ASSEMBLY__ */
#endif /* _MCSAFE_TEST_H_ */
......@@ -3,6 +3,7 @@
#include <linux/linkage.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/mcsafe_test.h>
#include <asm/alternative-asm.h>
#include <asm/export.h>
......@@ -183,6 +184,9 @@ ENTRY(memcpy_orig)
ENDPROC(memcpy_orig)
#ifndef CONFIG_UML
MCSAFE_TEST_CTL
/*
* __memcpy_mcsafe - memory copy with machine check exception handling
* Note that we only catch machine checks when reading the source addresses.
......@@ -206,6 +210,8 @@ ENTRY(__memcpy_mcsafe)
subl %ecx, %edx
.L_read_leading_bytes:
movb (%rsi), %al
MCSAFE_TEST_SRC %rsi 1 .E_leading_bytes
MCSAFE_TEST_DST %rdi 1 .E_leading_bytes
.L_write_leading_bytes:
movb %al, (%rdi)
incq %rsi
......@@ -221,6 +227,8 @@ ENTRY(__memcpy_mcsafe)
.L_read_words:
movq (%rsi), %r8
MCSAFE_TEST_SRC %rsi 8 .E_read_words
MCSAFE_TEST_DST %rdi 8 .E_write_words
.L_write_words:
movq %r8, (%rdi)
addq $8, %rsi
......@@ -237,6 +245,8 @@ ENTRY(__memcpy_mcsafe)
movl %edx, %ecx
.L_read_trailing_bytes:
movb (%rsi), %al
MCSAFE_TEST_SRC %rsi 1 .E_trailing_bytes
MCSAFE_TEST_DST %rdi 1 .E_trailing_bytes
.L_write_trailing_bytes:
movb %al, (%rdi)
incq %rsi
......
......@@ -1978,19 +1978,8 @@ static ssize_t range_index_show(struct device *dev,
}
static DEVICE_ATTR_RO(range_index);
static ssize_t ecc_unit_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nd_region *nd_region = to_nd_region(dev);
struct nfit_spa *nfit_spa = nd_region_provider_data(nd_region);
return sprintf(buf, "%d\n", nfit_spa->clear_err_unit);
}
static DEVICE_ATTR_RO(ecc_unit_size);
static struct attribute *acpi_nfit_region_attributes[] = {
&dev_attr_range_index.attr,
&dev_attr_ecc_unit_size.attr,
NULL,
};
......
......@@ -85,6 +85,7 @@ EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
{
struct dax_device *dax_dev;
bool dax_enabled = false;
pgoff_t pgoff;
int err, id;
void *kaddr;
......@@ -134,14 +135,21 @@ bool __bdev_dax_supported(struct block_device *bdev, int blocksize)
* on being able to do (page_address(pfn_to_page())).
*/
WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
dax_enabled = true;
} else if (pfn_t_devmap(pfn)) {
/* pass */;
} else {
struct dev_pagemap *pgmap;
pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
if (pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX)
dax_enabled = true;
put_dev_pagemap(pgmap);
}
if (!dax_enabled) {
pr_debug("%s: error: dax support not enabled\n",
bdevname(bdev, buf));
return false;
}
return true;
}
EXPORT_SYMBOL_GPL(__bdev_dax_supported);
......@@ -182,8 +190,7 @@ static ssize_t write_cache_show(struct device *dev,
if (!dax_dev)
return -ENXIO;
rc = sprintf(buf, "%d\n", !!test_bit(DAXDEV_WRITE_CACHE,
&dax_dev->flags));
rc = sprintf(buf, "%d\n", !!dax_write_cache_enabled(dax_dev));
put_dax(dax_dev);
return rc;
}
......@@ -201,10 +208,8 @@ static ssize_t write_cache_store(struct device *dev,
if (rc)
len = rc;
else if (write_cache)
set_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
else
clear_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags);
dax_write_cache(dax_dev, write_cache);
put_dax(dax_dev);
return len;
......@@ -282,11 +287,21 @@ size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
}
EXPORT_SYMBOL_GPL(dax_copy_from_iter);
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i)
{
if (!dax_alive(dax_dev))
return 0;
return dax_dev->ops->copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
EXPORT_SYMBOL_GPL(dax_copy_to_iter);
#ifdef CONFIG_ARCH_HAS_PMEM_API
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
if (unlikely(!dax_write_cache_enabled(dax_dev)))
return;
arch_wb_cache_pmem(addr, size);
......
......@@ -185,9 +185,24 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
static size_t linear_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
struct linear_c *lc = ti->private;
struct block_device *bdev = lc->dev->bdev;
struct dax_device *dax_dev = lc->dev->dax_dev;
sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
dev_sector = linear_map_sector(ti, sector);
if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
return 0;
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
#else
#define linear_dax_direct_access NULL
#define linear_dax_copy_from_iter NULL
#define linear_dax_copy_to_iter NULL
#endif
static struct target_type linear_target = {
......@@ -204,6 +219,7 @@ static struct target_type linear_target = {
.iterate_devices = linear_iterate_devices,
.direct_access = linear_dax_direct_access,
.dax_copy_from_iter = linear_dax_copy_from_iter,
.dax_copy_to_iter = linear_dax_copy_to_iter,
};
int __init dm_linear_init(void)
......
......@@ -962,9 +962,23 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
dax_copy:
return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}
static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
pgoff_t pgoff, void *addr, size_t bytes,
struct iov_iter *i)
{
struct log_writes_c *lc = ti->private;
sector_t sector = pgoff * PAGE_SECTORS;
if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
return 0;
return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}
#else
#define log_writes_dax_direct_access NULL
#define log_writes_dax_copy_from_iter NULL
#define log_writes_dax_copy_to_iter NULL
#endif
static struct target_type log_writes_target = {
......@@ -982,6 +996,7 @@ static struct target_type log_writes_target = {
.io_hints = log_writes_io_hints,
.direct_access = log_writes_dax_direct_access,
.dax_copy_from_iter = log_writes_dax_copy_from_iter,
.dax_copy_to_iter = log_writes_dax_copy_to_iter,
};
static int __init dm_log_writes_init(void)
......
......@@ -354,9 +354,29 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
}
static size_t stripe_dax_copy_to_iter(struct dm_target *ti, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
sector_t dev_sector, sector = pgoff * PAGE_SECTORS;
struct stripe_c *sc = ti->private;
struct dax_device *dax_dev;
struct block_device *bdev;
uint32_t stripe;
stripe_map_sector(sc, sector, &stripe, &dev_sector);
dev_sector += sc->stripe[stripe].physical_start;
dax_dev = sc->stripe[stripe].dev->dax_dev;
bdev = sc->stripe[stripe].dev->bdev;
if (bdev_dax_pgoff(bdev, dev_sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
return 0;
return dax_copy_to_iter(dax_dev, pgoff, addr, bytes, i);
}
#else
#define stripe_dax_direct_access NULL
#define stripe_dax_copy_from_iter NULL
#define stripe_dax_copy_to_iter NULL
#endif
/*
......@@ -478,6 +498,7 @@ static struct target_type stripe_target = {
.io_hints = stripe_io_hints,
.direct_access = stripe_dax_direct_access,
.dax_copy_from_iter = stripe_dax_copy_from_iter,
.dax_copy_to_iter = stripe_dax_copy_to_iter,
};
int __init dm_stripe_init(void)
......
......@@ -1089,6 +1089,30 @@ static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
return ret;
}
static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
struct mapped_device *md = dax_get_private(dax_dev);
sector_t sector = pgoff * PAGE_SECTORS;
struct dm_target *ti;
long ret = 0;
int srcu_idx;
ti = dm_dax_get_live_target(md, sector, &srcu_idx);
if (!ti)
goto out;
if (!ti->type->dax_copy_to_iter) {
ret = copy_to_iter(addr, bytes, i);
goto out;
}
ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
out:
dm_put_live_table(md, srcu_idx);
return ret;
}
/*
* A target may call dm_accept_partial_bio only from the map routine. It is
* allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
......@@ -3137,6 +3161,7 @@ static const struct block_device_operations dm_blk_dops = {
static const struct dax_operations dm_dax_ops = {
.direct_access = dm_dax_direct_access,
.copy_from_iter = dm_dax_copy_from_iter,
.copy_to_iter = dm_dax_copy_to_iter,
};
/*
......
......@@ -100,6 +100,9 @@ static int nvdimm_bus_probe(struct device *dev)
if (!try_module_get(provider))
return -ENXIO;
dev_dbg(&nvdimm_bus->dev, "START: %s.probe(%s)\n",
dev->driver->name, dev_name(dev));
nvdimm_bus_probe_start(nvdimm_bus);
rc = nd_drv->probe(dev);
if (rc == 0)
......@@ -108,7 +111,7 @@ static int nvdimm_bus_probe(struct device *dev)
nd_region_disable(nvdimm_bus, dev);
nvdimm_bus_probe_end(nvdimm_bus);
dev_dbg(&nvdimm_bus->dev, "%s.probe(%s) = %d\n", dev->driver->name,
dev_dbg(&nvdimm_bus->dev, "END: %s.probe(%s) = %d\n", dev->driver->name,
dev_name(dev), rc);
if (rc != 0)
......@@ -566,14 +569,18 @@ int nvdimm_revalidate_disk(struct gendisk *disk)
{
struct device *dev = disk_to_dev(disk)->parent;
struct nd_region *nd_region = to_nd_region(dev->parent);
const char *pol = nd_region->ro ? "only" : "write";
int disk_ro = get_disk_ro(disk);
if (nd_region->ro == get_disk_ro(disk))
/*
* Upgrade to read-only if the region is read-only preserve as
* read-only if the disk is already read-only.
*/
if (disk_ro || nd_region->ro == disk_ro)
return 0;
dev_info(dev, "%s read-%s, marking %s read-%s\n",
dev_name(&nd_region->dev), pol, disk->disk_name, pol);
set_disk_ro(disk, nd_region->ro);
dev_info(dev, "%s read-only, marking %s read-only\n",
dev_name(&nd_region->dev), disk->disk_name);
set_disk_ro(disk, 1);
return 0;
......
......@@ -38,12 +38,27 @@ static int e820_range_to_nid(resource_size_t addr)
}
#endif
static int e820_register_one(struct resource *res, void *data)
{
struct nd_region_desc ndr_desc;
struct nvdimm_bus *nvdimm_bus = data;
memset(&ndr_desc, 0, sizeof(ndr_desc));
ndr_desc.res = res;
ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
ndr_desc.numa_node = e820_range_to_nid(res->start);
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
return -ENXIO;
return 0;
}
static int e820_pmem_probe(struct platform_device *pdev)
{
static struct nvdimm_bus_descriptor nd_desc;
struct device *dev = &pdev->dev;
struct nvdimm_bus *nvdimm_bus;
struct resource *p;
int rc = -ENXIO;
nd_desc.attr_groups = e820_pmem_attribute_groups;
nd_desc.provider_name = "e820";
......@@ -53,27 +68,15 @@ static int e820_pmem_probe(struct platform_device *pdev)
goto err;
platform_set_drvdata(pdev, nvdimm_bus);
for (p = iomem_resource.child; p ; p = p->sibling) {
struct nd_region_desc ndr_desc;
if (p->desc != IORES_DESC_PERSISTENT_MEMORY_LEGACY)
continue;
memset(&ndr_desc, 0, sizeof(ndr_desc));
ndr_desc.res = p;
ndr_desc.attr_groups = e820_pmem_region_attribute_groups;
ndr_desc.numa_node = e820_range_to_nid(p->start);
set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags);
if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc))
goto err;
}
rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
IORESOURCE_MEM, 0, -1, nvdimm_bus, e820_register_one);
if (rc)
goto err;
return 0;
err:
err:
nvdimm_bus_unregister(nvdimm_bus);
dev_err(dev, "failed to register legacy persistent memory ranges\n");
return -ENXIO;
return rc;
}
static struct platform_driver e820_pmem_driver = {
......
......@@ -561,8 +561,6 @@ static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap)
res->start += start_pad;
res->end -= end_trunc;
pgmap->type = MEMORY_DEVICE_HOST;
if (nd_pfn->mode == PFN_MODE_RAM) {
if (offset < SZ_8K)
return -EINVAL;
......
......@@ -164,11 +164,6 @@ static blk_status_t pmem_do_bvec(struct pmem_device *pmem, struct page *page,
return rc;
}
/* account for REQ_FLUSH rename, replace with REQ_PREFLUSH after v4.8-rc1 */
#ifndef REQ_FLUSH
#define REQ_FLUSH REQ_PREFLUSH
#endif
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
{
blk_status_t rc = 0;
......@@ -179,7 +174,7 @@ static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
struct pmem_device *pmem = q->queuedata;
struct nd_region *nd_region = to_region(pmem);
if (bio->bi_opf & REQ_FLUSH)
if (bio->bi_opf & REQ_PREFLUSH)
nvdimm_flush(nd_region);
do_acct = nd_iostat_start(bio, &start);
......@@ -264,9 +259,16 @@ static size_t pmem_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
return copy_from_iter_flushcache(addr, bytes, i);
}
static size_t pmem_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
return copy_to_iter_mcsafe(addr, bytes, i);
}
static const struct dax_operations pmem_dax_ops = {
.direct_access = pmem_dax_direct_access,
.copy_from_iter = pmem_copy_from_iter,
.copy_to_iter = pmem_copy_to_iter,
};
static const struct attribute_group *pmem_attribute_groups[] = {
......@@ -294,12 +296,33 @@ static void pmem_release_disk(void *__pmem)
put_disk(pmem->disk);
}
static void pmem_release_pgmap_ops(void *__pgmap)
{
dev_pagemap_put_ops();
}
static void fsdax_pagefree(struct page *page, void *data)
{
wake_up_var(&page->_refcount);
}
static int setup_pagemap_fsdax(struct device *dev, struct dev_pagemap *pgmap)
{
dev_pagemap_get_ops();
if (devm_add_action_or_reset(dev, pmem_release_pgmap_ops, pgmap))
return -ENOMEM;
pgmap->type = MEMORY_DEVICE_FS_DAX;
pgmap->page_free = fsdax_pagefree;
return 0;
}
static int pmem_attach_disk(struct device *dev,
struct nd_namespace_common *ndns)
{
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
struct nd_region *nd_region = to_nd_region(dev->parent);
int nid = dev_to_node(dev), fua, wbc;
int nid = dev_to_node(dev), fua;
struct resource *res = &nsio->res;
struct resource bb_res;
struct nd_pfn *nd_pfn = NULL;
......@@ -335,7 +358,6 @@ static int pmem_attach_disk(struct device *dev,
dev_warn(dev, "unable to guarantee persistence of writes\n");
fua = 0;
}
wbc = nvdimm_has_cache(nd_region);
if (!devm_request_mem_region(dev, res->start, resource_size(res),
dev_name(&ndns->dev))) {
......@@ -353,6 +375,8 @@ static int pmem_attach_disk(struct device *dev,
pmem->pfn_flags = PFN_DEV;
pmem->pgmap.ref = &q->q_usage_counter;
if (is_nd_pfn(dev)) {
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
return -ENOMEM;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pfn_sb = nd_pfn->pfn_sb;
pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
......@@ -364,6 +388,8 @@ static int pmem_attach_disk(struct device *dev,
} else if (pmem_should_map_pages(dev)) {
memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res));
pmem->pgmap.altmap_valid = false;
if (setup_pagemap_fsdax(dev, &pmem->pgmap))
return -ENOMEM;
addr = devm_memremap_pages(dev, &pmem->pgmap);
pmem->pfn_flags |= PFN_MAP;
memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res));
......@@ -382,7 +408,7 @@ static int pmem_attach_disk(struct device *dev,
return PTR_ERR(addr);
pmem->virt_addr = addr;
blk_queue_write_cache(q, wbc, fua);
blk_queue_write_cache(q, true, fua);
blk_queue_make_request(q, pmem_make_request);
blk_queue_physical_block_size(q, PAGE_SIZE);
blk_queue_logical_block_size(q, pmem_sector_size(ndns));
......@@ -413,7 +439,7 @@ static int pmem_attach_disk(struct device *dev,
put_disk(disk);
return -ENOMEM;
}
dax_write_cache(dax_dev, wbc);
dax_write_cache(dax_dev, nvdimm_has_cache(nd_region));
pmem->dax_dev = dax_dev;
gendev = disk_to_dev(disk);
......
......@@ -1132,7 +1132,8 @@ EXPORT_SYMBOL_GPL(nvdimm_has_flush);
int nvdimm_has_cache(struct nd_region *nd_region)
{
return is_nd_pmem(&nd_region->dev);
return is_nd_pmem(&nd_region->dev) &&
!test_bit(ND_REGION_PERSIST_CACHE, &nd_region->flags);
}
EXPORT_SYMBOL_GPL(nvdimm_has_cache);
......
......@@ -51,9 +51,16 @@ static size_t dcssblk_dax_copy_from_iter(struct dax_device *dax_dev,
return copy_from_iter(addr, bytes, i);
}
static size_t dcssblk_dax_copy_to_iter(struct dax_device *dax_dev,
pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
{
return copy_to_iter(addr, bytes, i);
}
static const struct dax_operations dcssblk_dax_ops = {
.direct_access = dcssblk_dax_direct_access,
.copy_from_iter = dcssblk_dax_copy_from_iter,
.copy_to_iter = dcssblk_dax_copy_to_iter,
};
struct dcssblk_dev_info {
......
......@@ -38,6 +38,7 @@ config FS_DAX
bool "Direct Access (DAX) support"
depends on MMU
depends on !(ARM || MIPS || SPARC)
select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
select FS_IOMAP
select DAX
help
......
......@@ -351,6 +351,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
}
}
static struct page *dax_busy_page(void *entry)
{
unsigned long pfn;
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
if (page_ref_count(page) > 1)
return page;
}
return NULL;
}
/*
* Find radix tree entry at given index. If it points to an exceptional entry,
* return it with the radix tree entry locked. If the radix tree doesn't
......@@ -492,6 +505,90 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
return entry;
}
/**
* dax_layout_busy_page - find first pinned page in @mapping
* @mapping: address space to scan for a page with ref count > 1
*
* DAX requires ZONE_DEVICE mapped pages. These pages are never
* 'onlined' to the page allocator so they are considered idle when
* page->count == 1. A filesystem uses this interface to determine if
* any page in the mapping is busy, i.e. for DMA, or other
* get_user_pages() usages.
*
* It is expected that the filesystem is holding locks to block the
* establishment of new mappings in this address_space. I.e. it expects
* to be able to run unmap_mapping_range() and subsequently not race
* mapping_mapped() becoming true.
*/
struct page *dax_layout_busy_page(struct address_space *mapping)
{
pgoff_t indices[PAGEVEC_SIZE];
struct page *page = NULL;
struct pagevec pvec;
pgoff_t index, end;
unsigned i;
/*
* In the 'limited' case get_user_pages() for dax is disabled.
*/
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
return NULL;
if (!dax_mapping(mapping) || !mapping_mapped(mapping))
return NULL;
pagevec_init(&pvec);
index = 0;
end = -1;
/*
* If we race get_user_pages_fast() here either we'll see the
* elevated page count in the pagevec_lookup and wait, or
* get_user_pages_fast() will see that the page it took a reference
* against is no longer mapped in the page tables and bail to the
* get_user_pages() slow path. The slow path is protected by
* pte_lock() and pmd_lock(). New references are not taken without
* holding those locks, and unmap_mapping_range() will not zero the
* pte or pmd without holding the respective lock, so we are
* guaranteed to either see new references or prevent new
* references from being established.
*/
unmap_mapping_range(mapping, 0, 0, 1);
while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE),
indices)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *pvec_ent = pvec.pages[i];
void *entry;
index = indices[i];
if (index >= end)
break;
if (!radix_tree_exceptional_entry(pvec_ent))
continue;
xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
if (entry)
page = dax_busy_page(entry);
put_unlocked_mapping_entry(mapping, index, entry);
xa_unlock_irq(&mapping->i_pages);
if (page)
break;
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
index++;
if (page)
break;
}
return page;
}
EXPORT_SYMBOL_GPL(dax_layout_busy_page);
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
......@@ -912,7 +1009,6 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
unsigned long vaddr = vmf->address;
vm_fault_t ret = VM_FAULT_NOPAGE;
struct page *zero_page;
void *entry2;
pfn_t pfn;
zero_page = ZERO_PAGE(0);
......@@ -922,13 +1018,8 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
}
pfn = page_to_pfn_t(zero_page);
entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(entry2)) {
ret = VM_FAULT_SIGBUS;
goto out;
}
dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
false);
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
out:
trace_dax_load_hole(inode, vmf, ret);
......@@ -991,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iov_iter *iter = data;
loff_t end = pos + length, done = 0;
ssize_t ret = 0;
size_t xfer;
int id;
if (iov_iter_rw(iter) == READ) {
......@@ -1054,18 +1146,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* vfs_write(), depending on which operation we are doing.
*/
if (iov_iter_rw(iter) == WRITE)
map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr,
xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
map_len, iter);
else
map_len = copy_to_iter(kaddr, map_len, iter);
if (map_len <= 0) {
ret = map_len ? map_len : -EFAULT;
break;
}
xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
map_len, iter);
pos += map_len;
length -= map_len;
done += map_len;
pos += xfer;
length -= xfer;
done += xfer;
if (xfer == 0)
ret = -EFAULT;
if (xfer < map_len)
break;
}
dax_read_unlock(id);
......@@ -1240,10 +1334,6 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
0, write && !sync);
if (IS_ERR(entry)) {
error = PTR_ERR(entry);
goto error_finish_iomap;
}
/*
* If we are doing synchronous page fault and inode needs fsync,
......@@ -1324,8 +1414,6 @@ static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
pfn = page_to_pfn_t(zero_page);
ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(ret))
goto fallback;
ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (!pmd_none(*(vmf->pmd))) {
......@@ -1447,8 +1535,6 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
RADIX_DAX_PMD, write && !sync);
if (IS_ERR(entry))
goto finish_iomap;
/*
* If we are doing synchronous page fault and inode needs fsync,
......
......@@ -312,7 +312,7 @@ xfs_file_aio_write_checks(
if (error <= 0)
return error;
error = xfs_break_layouts(inode, iolock);
error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
if (error)
return error;
......@@ -731,6 +731,69 @@ xfs_file_write_iter(
return xfs_file_buffered_aio_write(iocb, from);
}
static void
xfs_wait_dax_page(
struct inode *inode,
bool *did_unlock)
{
struct xfs_inode *ip = XFS_I(inode);
*did_unlock = true;
xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
schedule();
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}
static int
xfs_break_dax_layouts(
struct inode *inode,
uint iolock,
bool *did_unlock)
{
struct page *page;
ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
page = dax_layout_busy_page(inode->i_mapping);
if (!page)
return 0;
return ___wait_var_event(&page->_refcount,
atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
0, 0, xfs_wait_dax_page(inode, did_unlock));
}
int
xfs_break_layouts(
struct inode *inode,
uint *iolock,
enum layout_break_reason reason)
{
bool retry;
int error;
ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
do {
retry = false;
switch (reason) {
case BREAK_UNMAP:
error = xfs_break_dax_layouts(inode, *iolock, &retry);
if (error || retry)
break;
/* fall through */
case BREAK_WRITE:
error = xfs_break_leased_layouts(inode, iolock, &retry);
break;
default:
WARN_ON_ONCE(1);
error = -EINVAL;
}
} while (error == 0 && retry);
return error;
}
#define XFS_FALLOC_FL_SUPPORTED \
(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \
......@@ -747,7 +810,7 @@ xfs_file_fallocate(
struct xfs_inode *ip = XFS_I(inode);
long error;
enum xfs_prealloc_flags flags = 0;
uint iolock = XFS_IOLOCK_EXCL;
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
loff_t new_size = 0;
bool do_file_insert = false;
......@@ -757,13 +820,10 @@ xfs_file_fallocate(
return -EOPNOTSUPP;
xfs_ilock(ip, iolock);
error = xfs_break_layouts(inode, &iolock);
error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
if (error)
goto out_unlock;
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
iolock |= XFS_MMAPLOCK_EXCL;
if (mode & FALLOC_FL_PUNCH_HOLE) {
error = xfs_free_file_space(ip, offset, len);
if (error)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment