Commit a3841f94 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm and dax updates from Dan Williams:
 "Save for a few late fixes, all of these commits have shipped in -next
  releases since before the merge window opened, and 0day has given a
  build success notification.

  The ext4 touches came from Jan, and the xfs touches have Darrick's
  reviewed-by. An xfstest for the MAP_SYNC feature has been through
  a few round of reviews and is on track to be merged.

   - Introduce MAP_SYNC and MAP_SHARED_VALIDATE, a mechanism to enable
     'userspace flush' of persistent memory updates via filesystem-dax
     mappings. It arranges for any filesystem metadata updates that may
     be required to satisfy a write fault to also be flushed ("on disk")
     before the kernel returns to userspace from the fault handler.
     Effectively every write-fault that dirties metadata completes an
     fsync() before returning from the fault handler. The new
     MAP_SHARED_VALIDATE mapping type guarantees that the MAP_SYNC flag
     is validated as supported by the filesystem's ->mmap() file
     operation.

   - Add support for the standard ACPI 6.2 label access methods that
     replace the NVDIMM_FAMILY_INTEL (vendor specific) label methods.
     This enables interoperability with environments that only implement
     the standardized methods.

   - Add support for the ACPI 6.2 NVDIMM media error injection methods.

   - Add support for the NVDIMM_FAMILY_INTEL v1.6 DIMM commands for
     latch last shutdown status, firmware update, SMART error injection,
     and SMART alarm threshold control.

   - Cleanup physical address information disclosures to be root-only.

   - Fix revalidation of the DIMM "locked label area" status to support
     dynamic unlock of the label area.

   - Expand unit test infrastructure to mock the ACPI 6.2 Translate SPA
     (system-physical-address) command and error injection commands.

  Acknowledgements that came after the commits were pushed to -next:

   - 957ac8c4 ("dax: fix PMD faults on zero-length files"):
Reviewed-by: default avatarRoss Zwisler <ross.zwisler@linux.intel.com>

   - a39e596b ("xfs: support for synchronous DAX faults") and
     7b565c9f ("xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()")
        Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>"

* tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (49 commits)
  acpi, nfit: add 'Enable Latch System Shutdown Status' command support
  dax: fix general protection fault in dax_alloc_inode
  dax: fix PMD faults on zero-length files
  dax: stop requiring a live device for dax_flush()
  brd: remove dax support
  dax: quiet bdev_dax_supported()
  fs, dax: unify IOMAP_F_DIRTY read vs write handling policy in the dax core
  tools/testing/nvdimm: unit test clear-error commands
  acpi, nfit: validate commands against the device type
  tools/testing/nvdimm: stricter bounds checking for error injection commands
  xfs: support for synchronous DAX faults
  xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()
  ext4: Support for synchronous DAX faults
  ext4: Simplify error handling in ext4_dax_huge_fault()
  dax: Implement dax_finish_sync_fault()
  dax, iomap: Add support for synchronous faults
  mm: Define MAP_SYNC and VM_SYNC flags
  dax: Allow tuning whether dax_insert_mapping_entry() dirties entry
  dax: Allow dax_iomap_fault() to return pfn
  dax: Fix comment describing dax_iomap_fault()
  ...
parents adeba81a 4247f24c
......@@ -4208,7 +4208,7 @@ L: linux-i2c@vger.kernel.org
S: Maintained
F: drivers/i2c/busses/i2c-diolan-u2c.c
DIRECT ACCESS (DAX)
FILESYSTEM DIRECT ACCESS (DAX)
M: Matthew Wilcox <mawilcox@microsoft.com>
M: Ross Zwisler <ross.zwisler@linux.intel.com>
L: linux-fsdevel@vger.kernel.org
......@@ -4217,6 +4217,12 @@ F: fs/dax.c
F: include/linux/dax.h
F: include/trace/events/fs_dax.h
DEVICE DIRECT ACCESS (DAX)
M: Dan Williams <dan.j.williams@intel.com>
L: linux-nvdimm@lists.01.org
S: Supported
F: drivers/dax/
DIRECTORY NOTIFICATION (DNOTIFY)
M: Jan Kara <jack@suse.cz>
R: Amir Goldstein <amir73il@gmail.com>
......
......@@ -12,6 +12,7 @@
#define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
#define MAP_TYPE 0x0f /* Mask for type of mapping (OSF/1 is _wrong_) */
#define MAP_FIXED 0x100 /* Interpret addr exactly */
#define MAP_ANONYMOUS 0x10 /* don't use a file */
......
......@@ -29,6 +29,7 @@
*/
#define MAP_SHARED 0x001 /* Share changes */
#define MAP_PRIVATE 0x002 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */
#define MAP_TYPE 0x00f /* Mask for type of mapping */
#define MAP_FIXED 0x010 /* Interpret addr exactly */
......
......@@ -12,6 +12,7 @@
#define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
#define MAP_TYPE 0x03 /* Mask for type of mapping */
#define MAP_FIXED 0x04 /* Interpret addr exactly */
#define MAP_ANONYMOUS 0x10 /* don't use a file */
......
......@@ -36,6 +36,7 @@
*/
#define MAP_SHARED 0x001 /* Share changes */
#define MAP_PRIVATE 0x002 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */
#define MAP_TYPE 0x00f /* Mask for type of mapping */
#define MAP_FIXED 0x010 /* Interpret addr exactly */
......
This diff is collapsed.
......@@ -67,7 +67,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
continue;
/* If this fails due to an -ENOMEM, there is little we can do */
nvdimm_bus_add_poison(acpi_desc->nvdimm_bus,
nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
ALIGN(mce->addr, L1_CACHE_BYTES),
L1_CACHE_BYTES);
nvdimm_region_notify(nfit_spa->nd_region,
......
......@@ -24,7 +24,7 @@
/* ACPI 6.1 */
#define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba"
/* http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf */
/* http://pmem.io/documents/NVDIMM_DSM_Interface-V1.6.pdf */
#define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66"
/* https://github.com/HewlettPackard/hpe-nvm/blob/master/Documentation/ */
......@@ -38,6 +38,37 @@
| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_MSFT
#define NVDIMM_STANDARD_CMDMASK \
(1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
| 1 << ND_CMD_GET_CONFIG_SIZE | 1 << ND_CMD_GET_CONFIG_DATA \
| 1 << ND_CMD_SET_CONFIG_DATA | 1 << ND_CMD_VENDOR_EFFECT_LOG_SIZE \
| 1 << ND_CMD_VENDOR_EFFECT_LOG | 1 << ND_CMD_VENDOR)
/*
* Command numbers that the kernel needs to know about to handle
* non-default DSM revision ids
*/
enum nvdimm_family_cmds {
NVDIMM_INTEL_LATCH_SHUTDOWN = 10,
NVDIMM_INTEL_GET_MODES = 11,
NVDIMM_INTEL_GET_FWINFO = 12,
NVDIMM_INTEL_START_FWUPDATE = 13,
NVDIMM_INTEL_SEND_FWUPDATE = 14,
NVDIMM_INTEL_FINISH_FWUPDATE = 15,
NVDIMM_INTEL_QUERY_FWUPDATE = 16,
NVDIMM_INTEL_SET_THRESHOLD = 17,
NVDIMM_INTEL_INJECT_ERROR = 18,
};
#define NVDIMM_INTEL_CMDMASK \
(NVDIMM_STANDARD_CMDMASK | 1 << NVDIMM_INTEL_GET_MODES \
| 1 << NVDIMM_INTEL_GET_FWINFO | 1 << NVDIMM_INTEL_START_FWUPDATE \
| 1 << NVDIMM_INTEL_SEND_FWUPDATE | 1 << NVDIMM_INTEL_FINISH_FWUPDATE \
| 1 << NVDIMM_INTEL_QUERY_FWUPDATE | 1 << NVDIMM_INTEL_SET_THRESHOLD \
| 1 << NVDIMM_INTEL_INJECT_ERROR | 1 << NVDIMM_INTEL_LATCH_SHUTDOWN)
enum nfit_uuids {
/* for simplicity alias the uuid index with the family id */
NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL,
......@@ -140,6 +171,9 @@ struct nfit_mem {
struct resource *flush_wpq;
unsigned long dsm_mask;
int family;
u32 has_lsi:1;
u32 has_lsr:1;
u32 has_lsw:1;
};
struct acpi_nfit_desc {
......@@ -167,6 +201,7 @@ struct acpi_nfit_desc {
unsigned int init_complete:1;
unsigned long dimm_cmd_force_en;
unsigned long bus_cmd_force_en;
unsigned long bus_nfit_cmd_force_en;
int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
void *iobuf, u64 len, int rw);
};
......
......@@ -302,7 +302,6 @@ config BLK_DEV_SX8
config BLK_DEV_RAM
tristate "RAM block device support"
select DAX if BLK_DEV_RAM_DAX
---help---
Saying Y here will allow you to use a portion of your RAM memory as
a block device, so that you can make file systems on it, read and
......@@ -338,17 +337,6 @@ config BLK_DEV_RAM_SIZE
The default value is 4096 kilobytes. Only change this if you know
what you are doing.
config BLK_DEV_RAM_DAX
bool "Support Direct Access (DAX) to RAM block devices"
depends on BLK_DEV_RAM && FS_DAX
default n
help
Support filesystems using DAX to access RAM block devices. This
avoids double-buffering data in the page cache before copying it
to the block device. Answering Y will slightly enlarge the kernel,
and will prevent RAM block device backing store memory from being
allocated from highmem (only a problem for highmem systems).
config CDROM_PKTCDVD
tristate "Packet writing on CD/DVD media (DEPRECATED)"
depends on !UML
......
......@@ -21,11 +21,6 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#ifdef CONFIG_BLK_DEV_RAM_DAX
#include <linux/pfn_t.h>
#include <linux/dax.h>
#include <linux/uio.h>
#endif
#include <linux/uaccess.h>
......@@ -45,9 +40,6 @@ struct brd_device {
struct request_queue *brd_queue;
struct gendisk *brd_disk;
#ifdef CONFIG_BLK_DEV_RAM_DAX
struct dax_device *dax_dev;
#endif
struct list_head brd_list;
/*
......@@ -112,9 +104,6 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
* restriction might be able to be lifted.
*/
gfp_flags = GFP_NOIO | __GFP_ZERO;
#ifndef CONFIG_BLK_DEV_RAM_DAX
gfp_flags |= __GFP_HIGHMEM;
#endif
page = alloc_page(gfp_flags);
if (!page)
return NULL;
......@@ -334,43 +323,6 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
return err;
}
#ifdef CONFIG_BLK_DEV_RAM_DAX
static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn)
{
struct page *page;
if (!brd)
return -ENODEV;
page = brd_insert_page(brd, (sector_t)pgoff << PAGE_SECTORS_SHIFT);
if (!page)
return -ENOSPC;
*kaddr = page_address(page);
*pfn = page_to_pfn_t(page);
return 1;
}
static long brd_dax_direct_access(struct dax_device *dax_dev,
pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
{
struct brd_device *brd = dax_get_private(dax_dev);
return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn);
}
static size_t brd_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
return copy_from_iter(addr, bytes, i);
}
static const struct dax_operations brd_dax_ops = {
.direct_access = brd_dax_direct_access,
.copy_from_iter = brd_dax_copy_from_iter,
};
#endif
static const struct block_device_operations brd_fops = {
.owner = THIS_MODULE,
.rw_page = brd_rw_page,
......@@ -451,21 +403,8 @@ static struct brd_device *brd_alloc(int i)
set_capacity(disk, rd_size * 2);
disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
#ifdef CONFIG_BLK_DEV_RAM_DAX
queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops);
if (!brd->dax_dev)
goto out_free_inode;
#endif
return brd;
#ifdef CONFIG_BLK_DEV_RAM_DAX
out_free_inode:
kill_dax(brd->dax_dev);
put_dax(brd->dax_dev);
#endif
out_free_queue:
blk_cleanup_queue(brd->brd_queue);
out_free_dev:
......@@ -505,10 +444,6 @@ static struct brd_device *brd_init_one(int i, bool *new)
static void brd_del_one(struct brd_device *brd)
{
list_del(&brd->brd_list);
#ifdef CONFIG_BLK_DEV_RAM_DAX
kill_dax(brd->dax_dev);
put_dax(brd->dax_dev);
#endif
del_gendisk(brd->brd_disk);
brd_free(brd);
}
......
......@@ -222,7 +222,8 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
unsigned long size)
{
struct resource *res;
phys_addr_t phys;
/* gcc-4.6.3-nolibc for i386 complains that this is uninitialized */
phys_addr_t uninitialized_var(phys);
int i;
for (i = 0; i < dev_dax->num_resources; i++) {
......
......@@ -92,21 +92,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
long len;
if (blocksize != PAGE_SIZE) {
pr_err("VFS (%s): error: unsupported blocksize for dax\n",
pr_debug("VFS (%s): error: unsupported blocksize for dax\n",
sb->s_id);
return -EINVAL;
}
err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
if (err) {
pr_err("VFS (%s): error: unaligned partition for dax\n",
pr_debug("VFS (%s): error: unaligned partition for dax\n",
sb->s_id);
return err;
}
dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
if (!dax_dev) {
pr_err("VFS (%s): error: device does not support dax\n",
pr_debug("VFS (%s): error: device does not support dax\n",
sb->s_id);
return -EOPNOTSUPP;
}
......@@ -118,7 +118,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
put_dax(dax_dev);
if (len < 1) {
pr_err("VFS (%s): error: dax access failed (%ld)",
pr_debug("VFS (%s): error: dax access failed (%ld)\n",
sb->s_id, len);
return len < 0 ? len : -EIO;
}
......@@ -273,9 +273,6 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter);
void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{
if (unlikely(!dax_alive(dax_dev)))
return;
if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
return;
......@@ -344,6 +341,9 @@ static struct inode *dax_alloc_inode(struct super_block *sb)
struct inode *inode;
dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
if (!dax_dev)
return NULL;
inode = &dax_dev->inode;
inode->i_rdev = 0;
return inode;
......
......@@ -21,6 +21,7 @@ libnvdimm-y += region_devs.o
libnvdimm-y += region.o
libnvdimm-y += namespace_devs.o
libnvdimm-y += label.o
libnvdimm-y += badrange.o
libnvdimm-$(CONFIG_ND_CLAIM) += claim.o
libnvdimm-$(CONFIG_BTT) += btt_devs.o
libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o
......
/*
* Copyright(c) 2017 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/libnvdimm.h>
#include <linux/badblocks.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/ctype.h>
#include <linux/ndctl.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/io.h>
#include "nd-core.h"
#include "nd.h"
void badrange_init(struct badrange *badrange)
{
INIT_LIST_HEAD(&badrange->list);
spin_lock_init(&badrange->lock);
}
EXPORT_SYMBOL_GPL(badrange_init);
static void append_badrange_entry(struct badrange *badrange,
struct badrange_entry *bre, u64 addr, u64 length)
{
lockdep_assert_held(&badrange->lock);
bre->start = addr;
bre->length = length;
list_add_tail(&bre->list, &badrange->list);
}
static int alloc_and_append_badrange_entry(struct badrange *badrange,
u64 addr, u64 length, gfp_t flags)
{
struct badrange_entry *bre;
bre = kzalloc(sizeof(*bre), flags);
if (!bre)
return -ENOMEM;
append_badrange_entry(badrange, bre, addr, length);
return 0;
}
static int add_badrange(struct badrange *badrange, u64 addr, u64 length)
{
struct badrange_entry *bre, *bre_new;
spin_unlock(&badrange->lock);
bre_new = kzalloc(sizeof(*bre_new), GFP_KERNEL);
spin_lock(&badrange->lock);
if (list_empty(&badrange->list)) {
if (!bre_new)
return -ENOMEM;
append_badrange_entry(badrange, bre_new, addr, length);
return 0;
}
/*
* There is a chance this is a duplicate, check for those first.
* This will be the common case as ARS_STATUS returns all known
* errors in the SPA space, and we can't query it per region
*/
list_for_each_entry(bre, &badrange->list, list)
if (bre->start == addr) {
/* If length has changed, update this list entry */
if (bre->length != length)
bre->length = length;
kfree(bre_new);
return 0;
}
/*
* If not a duplicate or a simple length update, add the entry as is,
* as any overlapping ranges will get resolved when the list is consumed
* and converted to badblocks
*/
if (!bre_new)
return -ENOMEM;
append_badrange_entry(badrange, bre_new, addr, length);
return 0;
}
int badrange_add(struct badrange *badrange, u64 addr, u64 length)
{
int rc;
spin_lock(&badrange->lock);
rc = add_badrange(badrange, addr, length);
spin_unlock(&badrange->lock);
return rc;
}
EXPORT_SYMBOL_GPL(badrange_add);
void badrange_forget(struct badrange *badrange, phys_addr_t start,
unsigned int len)
{
struct list_head *badrange_list = &badrange->list;
u64 clr_end = start + len - 1;
struct badrange_entry *bre, *next;
spin_lock(&badrange->lock);
/*
* [start, clr_end] is the badrange interval being cleared.
* [bre->start, bre_end] is the badrange_list entry we're comparing
* the above interval against. The badrange list entry may need
* to be modified (update either start or length), deleted, or
* split into two based on the overlap characteristics
*/
list_for_each_entry_safe(bre, next, badrange_list, list) {
u64 bre_end = bre->start + bre->length - 1;
/* Skip intervals with no intersection */
if (bre_end < start)
continue;
if (bre->start > clr_end)
continue;
/* Delete completely overlapped badrange entries */
if ((bre->start >= start) && (bre_end <= clr_end)) {
list_del(&bre->list);
kfree(bre);
continue;
}
/* Adjust start point of partially cleared entries */
if ((start <= bre->start) && (clr_end > bre->start)) {
bre->length -= clr_end - bre->start + 1;
bre->start = clr_end + 1;
continue;
}
/* Adjust bre->length for partial clearing at the tail end */
if ((bre->start < start) && (bre_end <= clr_end)) {
/* bre->start remains the same */
bre->length = start - bre->start;
continue;
}
/*
* If clearing in the middle of an entry, we split it into
* two by modifying the current entry to represent one half of
* the split, and adding a new entry for the second half.
*/
if ((bre->start < start) && (bre_end > clr_end)) {
u64 new_start = clr_end + 1;
u64 new_len = bre_end - new_start + 1;
/* Add new entry covering the right half */
alloc_and_append_badrange_entry(badrange, new_start,
new_len, GFP_NOWAIT);
/* Adjust this entry to cover the left half */
bre->length = start - bre->start;
continue;
}
}
spin_unlock(&badrange->lock);
}
EXPORT_SYMBOL_GPL(badrange_forget);
static void set_badblock(struct badblocks *bb, sector_t s, int num)
{
dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n",
(u64) s * 512, (u64) num * 512);
/* this isn't an error as the hardware will still throw an exception */
if (badblocks_set(bb, s, num, 1))
dev_info_once(bb->dev, "%s: failed for sector %llx\n",
__func__, (u64) s);
}
/**
* __add_badblock_range() - Convert a physical address range to bad sectors
* @bb: badblocks instance to populate
* @ns_offset: namespace offset where the error range begins (in bytes)
* @len: number of bytes of badrange to be added
*
* This assumes that the range provided with (ns_offset, len) is within
* the bounds of physical addresses for this namespace, i.e. lies in the
* interval [ns_start, ns_start + ns_size)
*/
static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
{
const unsigned int sector_size = 512;
sector_t start_sector, end_sector;
u64 num_sectors;
u32 rem;
start_sector = div_u64(ns_offset, sector_size);
end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
if (rem)
end_sector++;
num_sectors = end_sector - start_sector;
if (unlikely(num_sectors > (u64)INT_MAX)) {
u64 remaining = num_sectors;
sector_t s = start_sector;
while (remaining) {
int done = min_t(u64, remaining, INT_MAX);
set_badblock(bb, s, done);
remaining -= done;
s += done;
}
} else
set_badblock(bb, start_sector, num_sectors);
}
static void badblocks_populate(struct badrange *badrange,
struct badblocks *bb, const struct resource *res)
{
struct badrange_entry *bre;
if (list_empty(&badrange->list))
return;
list_for_each_entry(bre, &badrange->list, list) {
u64 bre_end = bre->start + bre->length - 1;
/* Discard intervals with no intersection */
if (bre_end < res->start)
continue;
if (bre->start > res->end)
continue;
/* Deal with any overlap after start of the namespace */
if (bre->start >= res->start) {
u64 start = bre->start;
u64 len;
if (bre_end <= res->end)
len = bre->length;
else
len = res->start + resource_size(res)
- bre->start;
__add_badblock_range(bb, start - res->start, len);
continue;
}
/*
* Deal with overlap for badrange starting before
* the namespace.
*/
if (bre->start < res->start) {
u64 len;
if (bre_end < res->end)
len = bre->start + bre->length - res->start;
else
len = resource_size(res);
__add_badblock_range(bb, 0, len);
}
}
}
/**
* nvdimm_badblocks_populate() - Convert a list of badranges to badblocks
* @region: parent region of the range to interrogate
* @bb: badblocks instance to populate
* @res: resource range to consider
*
* The badrange list generated during bus initialization may contain
* multiple, possibly overlapping physical address ranges. Compare each
* of these ranges to the resource range currently being initialized,
* and add badblocks entries for all matching sub-ranges
*/
void nvdimm_badblocks_populate(struct nd_region *nd_region,
struct badblocks *bb, const struct resource *res)
{
struct nvdimm_bus *nvdimm_bus;
if (!is_memory(&nd_region->dev)) {
dev_WARN_ONCE(&nd_region->dev, 1,
"%s only valid for pmem regions\n", __func__);
return;
}
nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
nvdimm_bus_lock(&nvdimm_bus->dev);
badblocks_populate(&nvdimm_bus->badrange, bb, res);
nvdimm_bus_unlock(&nvdimm_bus->dev);
}
EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
......@@ -11,6 +11,7 @@
* General Public License for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/libnvdimm.h>
#include <linux/sched/mm.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
......@@ -221,7 +222,7 @@ static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus,
phys_addr_t phys, u64 cleared)
{
if (cleared > 0)
nvdimm_forget_poison(nvdimm_bus, phys, cleared);
badrange_forget(&nvdimm_bus->badrange, phys, cleared);
if (cleared > 0 && cleared / 512)
nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared);
......@@ -344,11 +345,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
return NULL;
INIT_LIST_HEAD(&nvdimm_bus->list);
INIT_LIST_HEAD(&nvdimm_bus->mapping_list);
INIT_LIST_HEAD(&nvdimm_bus->poison_list);
init_waitqueue_head(&nvdimm_bus->probe_wait);
nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
mutex_init(&nvdimm_bus->reconfig_mutex);
spin_lock_init(&nvdimm_bus->poison_lock);
badrange_init(&nvdimm_bus->badrange);
if (nvdimm_bus->id < 0) {
kfree(nvdimm_bus);
return NULL;
......@@ -395,15 +395,15 @@ static int child_unregister(struct device *dev, void *data)
return 0;
}
static void free_poison_list(struct list_head *poison_list)
static void free_badrange_list(struct list_head *badrange_list)
{
struct nd_poison *pl, *next;
struct badrange_entry *bre, *next;
list_for_each_entry_safe(pl, next, poison_list, list) {
list_del(&pl->list);
kfree(pl);
list_for_each_entry_safe(bre, next, badrange_list, list) {
list_del(&bre->list);
kfree(bre);
}
list_del_init(poison_list);
list_del_init(badrange_list);
}
static int nd_bus_remove(struct device *dev)
......@@ -417,9 +417,9 @@ static int nd_bus_remove(struct device *dev)
nd_synchronize();
device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
spin_lock(&nvdimm_bus->poison_lock);
free_poison_list(&nvdimm_bus->poison_list);
spin_unlock(&nvdimm_bus->poison_lock);
spin_lock(&nvdimm_bus->badrange.lock);
free_badrange_list(&nvdimm_bus->badrange.list);
spin_unlock(&nvdimm_bus->badrange.lock);
nvdimm_bus_destroy_ndctl(nvdimm_bus);
......
......@@ -398,265 +398,11 @@ struct attribute_group nvdimm_bus_attribute_group = {
};
EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
static void set_badblock(struct badblocks *bb, sector_t s, int num)
int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{
dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n",
(u64) s * 512, (u64) num * 512);
/* this isn't an error as the hardware will still throw an exception */
if (badblocks_set(bb, s, num, 1))
dev_info_once(bb->dev, "%s: failed for sector %llx\n",
__func__, (u64) s);
return badrange_add(&nvdimm_bus->badrange, addr, length);
}
/**
* __add_badblock_range() - Convert a physical address range to bad sectors
* @bb: badblocks instance to populate
* @ns_offset: namespace offset where the error range begins (in bytes)
* @len: number of bytes of poison to be added
*
* This assumes that the range provided with (ns_offset, len) is within
* the bounds of physical addresses for this namespace, i.e. lies in the
* interval [ns_start, ns_start + ns_size)
*/
static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
{
const unsigned int sector_size = 512;
sector_t start_sector, end_sector;
u64 num_sectors;
u32 rem;
start_sector = div_u64(ns_offset, sector_size);
end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
if (rem)
end_sector++;
num_sectors = end_sector - start_sector;
if (unlikely(num_sectors > (u64)INT_MAX)) {
u64 remaining = num_sectors;
sector_t s = start_sector;
while (remaining) {
int done = min_t(u64, remaining, INT_MAX);
set_badblock(bb, s, done);
remaining -= done;
s += done;
}
} else
set_badblock(bb, start_sector, num_sectors);
}
static void badblocks_populate(struct list_head *poison_list,
struct badblocks *bb, const struct resource *res)
{
struct nd_poison *pl;
if (list_empty(poison_list))
return;
list_for_each_entry(pl, poison_list, list) {
u64 pl_end = pl->start + pl->length - 1;
/* Discard intervals with no intersection */
if (pl_end < res->start)
continue;
if (pl->start > res->end)
continue;
/* Deal with any overlap after start of the namespace */
if (pl->start >= res->start) {
u64 start = pl->start;
u64 len;
if (pl_end <= res->end)
len = pl->length;
else
len = res->start + resource_size(res)
- pl->start;
__add_badblock_range(bb, start - res->start, len);
continue;
}
/* Deal with overlap for poison starting before the namespace */
if (pl->start < res->start) {
u64 len;
if (pl_end < res->end)
len = pl->start + pl->length - res->start;
else
len = resource_size(res);
__add_badblock_range(bb, 0, len);
}
}
}
/**
* nvdimm_badblocks_populate() - Convert a list of poison ranges to badblocks
* @region: parent region of the range to interrogate
* @bb: badblocks instance to populate
* @res: resource range to consider
*
* The poison list generated during bus initialization may contain
* multiple, possibly overlapping physical address ranges. Compare each
* of these ranges to the resource range currently being initialized,
* and add badblocks entries for all matching sub-ranges
*/
void nvdimm_badblocks_populate(struct nd_region *nd_region,
struct badblocks *bb, const struct resource *res)
{
struct nvdimm_bus *nvdimm_bus;
struct list_head *poison_list;
if (!is_memory(&nd_region->dev)) {
dev_WARN_ONCE(&nd_region->dev, 1,
"%s only valid for pmem regions\n", __func__);
return;
}
nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
poison_list = &nvdimm_bus->poison_list;
nvdimm_bus_lock(&nvdimm_bus->dev);
badblocks_populate(poison_list, bb, res);
nvdimm_bus_unlock(&nvdimm_bus->dev);
}
EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
static void append_poison_entry(struct nvdimm_bus *nvdimm_bus,
struct nd_poison *pl, u64 addr, u64 length)
{
lockdep_assert_held(&nvdimm_bus->poison_lock);
pl->start = addr;
pl->length = length;
list_add_tail(&pl->list, &nvdimm_bus->poison_list);
}
static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
gfp_t flags)
{
struct nd_poison *pl;
pl = kzalloc(sizeof(*pl), flags);
if (!pl)
return -ENOMEM;
append_poison_entry(nvdimm_bus, pl, addr, length);
return 0;
}
static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{
struct nd_poison *pl, *pl_new;
spin_unlock(&nvdimm_bus->poison_lock);
pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL);
spin_lock(&nvdimm_bus->poison_lock);
if (list_empty(&nvdimm_bus->poison_list)) {
if (!pl_new)
return -ENOMEM;
append_poison_entry(nvdimm_bus, pl_new, addr, length);
return 0;
}
/*
* There is a chance this is a duplicate, check for those first.
* This will be the common case as ARS_STATUS returns all known
* errors in the SPA space, and we can't query it per region
*/
list_for_each_entry(pl, &nvdimm_bus->poison_list, list)
if (pl->start == addr) {
/* If length has changed, update this list entry */
if (pl->length != length)
pl->length = length;
kfree(pl_new);
return 0;
}
/*
* If not a duplicate or a simple length update, add the entry as is,
* as any overlapping ranges will get resolved when the list is consumed
* and converted to badblocks
*/
if (!pl_new)
return -ENOMEM;
append_poison_entry(nvdimm_bus, pl_new, addr, length);
return 0;
}
int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{
int rc;
spin_lock(&nvdimm_bus->poison_lock);
rc = bus_add_poison(nvdimm_bus, addr, length);
spin_unlock(&nvdimm_bus->poison_lock);
return rc;
}
EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
unsigned int len)
{
struct list_head *poison_list = &nvdimm_bus->poison_list;
u64 clr_end = start + len - 1;
struct nd_poison *pl, *next;
spin_lock(&nvdimm_bus->poison_lock);
WARN_ON_ONCE(list_empty(poison_list));
/*
* [start, clr_end] is the poison interval being cleared.
* [pl->start, pl_end] is the poison_list entry we're comparing
* the above interval against. The poison list entry may need
* to be modified (update either start or length), deleted, or
* split into two based on the overlap characteristics
*/
list_for_each_entry_safe(pl, next, poison_list, list) {
u64 pl_end = pl->start + pl->length - 1;
/* Skip intervals with no intersection */
if (pl_end < start)
continue;
if (pl->start > clr_end)
continue;
/* Delete completely overlapped poison entries */
if ((pl->start >= start) && (pl_end <= clr_end)) {
list_del(&pl->list);
kfree(pl);
continue;
}
/* Adjust start point of partially cleared entries */
if ((start <= pl->start) && (clr_end > pl->start)) {
pl->length -= clr_end - pl->start + 1;
pl->start = clr_end + 1;
continue;
}
/* Adjust pl->length for partial clearing at the tail end */
if ((pl->start < start) && (pl_end <= clr_end)) {
/* pl->start remains the same */
pl->length = start - pl->start;
continue;
}
/*
* If clearing in the middle of an entry, we split it into
* two by modifying the current entry to represent one half of
* the split, and adding a new entry for the second half.
*/
if ((pl->start < start) && (pl_end > clr_end)) {
u64 new_start = clr_end + 1;
u64 new_len = pl_end - new_start + 1;
/* Add new entry covering the right half */
add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT);
/* Adjust this entry to cover the left half */
pl->length = start - pl->start;
continue;
}
}
spin_unlock(&nvdimm_bus->poison_lock);
}
EXPORT_SYMBOL_GPL(nvdimm_forget_poison);
EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange);
#ifdef CONFIG_BLK_DEV_INTEGRITY
int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)
......
......@@ -55,6 +55,8 @@ static int nvdimm_probe(struct device *dev)
goto err;
rc = nvdimm_init_config_data(ndd);
if (rc == -EACCES)
nvdimm_set_locked(dev);
if (rc)
goto err;
......@@ -68,6 +70,7 @@ static int nvdimm_probe(struct device *dev)
rc = nd_label_reserve_dpa(ndd);
if (ndd->ns_current >= 0)
nvdimm_set_aliasing(dev);
nvdimm_clear_locked(dev);
nvdimm_bus_unlock(dev);
if (rc)
......
......@@ -200,6 +200,13 @@ void nvdimm_set_locked(struct device *dev)
set_bit(NDD_LOCKED, &nvdimm->flags);
}
void nvdimm_clear_locked(struct device *dev)
{
struct nvdimm *nvdimm = to_nvdimm(dev);
clear_bit(NDD_LOCKED, &nvdimm->flags);
}
static void nvdimm_release(struct device *dev)
{
struct nvdimm *nvdimm = to_nvdimm(dev);
......@@ -324,6 +331,17 @@ static ssize_t commands_show(struct device *dev,
}
static DEVICE_ATTR_RO(commands);
static ssize_t flags_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nvdimm *nvdimm = to_nvdimm(dev);
return sprintf(buf, "%s%s\n",
test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "",
test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : "");
}
static DEVICE_ATTR_RO(flags);
static ssize_t state_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
......@@ -365,6 +383,7 @@ static DEVICE_ATTR_RO(available_slots);
static struct attribute *nvdimm_attributes[] = {
&dev_attr_state.attr,
&dev_attr_flags.attr,
&dev_attr_commands.attr,
&dev_attr_available_slots.attr,
NULL,
......
......@@ -1050,7 +1050,7 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
nsindex = to_namespace_index(ndd, 0);
memset(nsindex, 0, ndd->nsarea.config_size);
for (i = 0; i < 2; i++) {
int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT);
int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT);
if (rc)
return rc;
......
......@@ -1620,7 +1620,7 @@ static umode_t namespace_visible(struct kobject *kobj,
if (a == &dev_attr_resource.attr) {
if (is_namespace_blk(dev))
return 0;
return a->mode;
return 0400;
}
if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
......@@ -1875,7 +1875,7 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
* @nspm: target namespace to create
* @nd_label: target pmem namespace label to evaluate
*/
struct device *create_namespace_pmem(struct nd_region *nd_region,
static struct device *create_namespace_pmem(struct nd_region *nd_region,
struct nd_namespace_index *nsindex,
struct nd_namespace_label *nd_label)
{
......@@ -2186,7 +2186,7 @@ static int add_namespace_resource(struct nd_region *nd_region,
return i;
}
struct device *create_namespace_blk(struct nd_region *nd_region,
static struct device *create_namespace_blk(struct nd_region *nd_region,
struct nd_namespace_label *nd_label, int count)
{
......
......@@ -29,10 +29,9 @@ struct nvdimm_bus {
struct list_head list;
struct device dev;
int id, probe_active;
struct list_head poison_list;
struct list_head mapping_list;
struct mutex reconfig_mutex;
spinlock_t poison_lock;
struct badrange badrange;
};
struct nvdimm {
......
......@@ -34,12 +34,6 @@ enum {
NVDIMM_IO_ATOMIC = 1,
};
struct nd_poison {
u64 start;
u64 length;
struct list_head list;
};
struct nvdimm_drvdata {
struct device *dev;
int nslabel_size;
......@@ -254,6 +248,7 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
unsigned int len);
void nvdimm_set_aliasing(struct device *dev);
void nvdimm_set_locked(struct device *dev);
void nvdimm_clear_locked(struct device *dev);
struct nd_btt *to_nd_btt(struct device *dev);
struct nd_gen_sb {
......
......@@ -282,8 +282,16 @@ static struct attribute *nd_pfn_attributes[] = {
NULL,
};
static umode_t pfn_visible(struct kobject *kobj, struct attribute *a, int n)
{
if (a == &dev_attr_resource.attr)
return 0400;
return a->mode;
}
struct attribute_group nd_pfn_attribute_group = {
.attrs = nd_pfn_attributes,
.is_visible = pfn_visible,
};
static const struct attribute_group *nd_pfn_attribute_groups[] = {
......
......@@ -562,8 +562,12 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
return 0;
if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr)
return 0;
if (a == &dev_attr_resource.attr) {
if (is_nd_pmem(dev))
return 0400;
else
return 0;
}
if (a == &dev_attr_deep_flush.attr) {
int has_flush = nvdimm_has_flush(nd_region);
......
This diff is collapsed.
......@@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf)
}
down_read(&ei->dax_sem);
ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops);
ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops);
up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE)
......
......@@ -28,6 +28,7 @@
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
......@@ -297,6 +298,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
*/
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED);
pfn_t pfn;
if (write) {
sb_start_pagefault(sb);
......@@ -304,16 +306,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
if (IS_ERR(handle)) {
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
return VM_FAULT_SIGBUS;
}
} else {
down_read(&EXT4_I(inode)->i_mmap_sem);
}
if (!IS_ERR(handle))
result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
else
result = VM_FAULT_SIGBUS;
result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops);
if (write) {
if (!IS_ERR(handle))
ext4_journal_stop(handle);
ext4_journal_stop(handle);
/* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
result = dax_finish_sync_fault(vmf, pe_size, pfn);
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
} else {
......@@ -351,6 +357,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
/*
* We don't support synchronous mappings for non-DAX files. At least
* until someone comes with a sensible use case.
*/
if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
return -EOPNOTSUPP;
file_accessed(file);
if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops;
......@@ -469,6 +482,7 @@ const struct file_operations ext4_file_operations = {
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
......
......@@ -3384,6 +3384,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
static bool ext4_inode_datasync_dirty(struct inode *inode)
{
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
if (journal)
return !jbd2_transaction_committed(journal,
EXT4_I(inode)->i_datasync_tid);
/* Any metadata buffers to write? */
if (!list_empty(&inode->i_mapping->private_list))
return true;
return inode->i_state & I_DIRTY_DATASYNC;
}
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
{
......@@ -3497,6 +3510,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
}
iomap->flags = 0;
if (ext4_inode_datasync_dirty(inode))
iomap->flags |= IOMAP_F_DIRTY;
iomap->bdev = inode->i_sb->s_bdev;
iomap->dax_dev = sbi->s_daxdev;
iomap->offset = first_block << blkbits;
......
......@@ -737,6 +737,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
return err;
}
/* Return 1 when transaction with given tid has already committed. */
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
{
int ret = 1;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction &&
journal->j_running_transaction->t_tid == tid)
ret = 0;
if (journal->j_committing_transaction &&
journal->j_committing_transaction->t_tid == tid)
ret = 0;
read_unlock(&journal->j_state_lock);
return ret;
}
EXPORT_SYMBOL(jbd2_transaction_committed);
/*
* When this function returns the transaction corresponding to tid
* will be completed. If the transaction has currently running, start
......
......@@ -661,6 +661,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
[ilog2(VM_SYNC)] = "sf",
[ilog2(VM_ARCH_1)] = "ar",
[ilog2(VM_WIPEONFORK)] = "wf",
[ilog2(VM_DONTDUMP)] = "dd",
......
......@@ -44,6 +44,7 @@
#include <linux/falloc.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
static const struct vm_operations_struct xfs_file_vm_ops;
......@@ -1045,7 +1046,11 @@ __xfs_filemap_fault(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
pfn_t pfn;
ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
} else {
if (write_fault)
ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
......@@ -1090,37 +1095,16 @@ xfs_filemap_page_mkwrite(
}
/*
* pfn_mkwrite was originally inteneded to ensure we capture time stamp
* updates on write faults. In reality, it's need to serialise against
* truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
* to ensure we serialise the fault barrier in place.
* pfn_mkwrite was originally intended to ensure we capture time stamp updates
* on write faults. In reality, it needs to serialise against truncate and
* prepare memory for writing so handle is as standard write fault.
*/
static int
xfs_filemap_pfn_mkwrite(
struct vm_fault *vmf)
{
struct inode *inode = file_inode(vmf->vma->vm_file);
struct xfs_inode *ip = XFS_I(inode);
int ret = VM_FAULT_NOPAGE;
loff_t size;
trace_xfs_filemap_pfn_mkwrite(ip);
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
/* check if the faulting page hasn't raced with truncate */
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (vmf->pgoff >= size)
ret = VM_FAULT_SIGBUS;
else if (IS_DAX(inode))
ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
sb_end_pagefault(inode->i_sb);
return ret;
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
static const struct vm_operations_struct xfs_file_vm_ops = {
......@@ -1136,6 +1120,13 @@ xfs_file_mmap(
struct file *filp,
struct vm_area_struct *vma)
{
/*
* We don't support synchronous mappings for non-DAX files. At least
* until someone comes with a sensible use case.
*/
if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
return -EOPNOTSUPP;
file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp)))
......@@ -1154,6 +1145,7 @@ const struct file_operations xfs_file_operations = {
.compat_ioctl = xfs_file_compat_ioctl,
#endif
.mmap = xfs_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = xfs_file_open,
.release = xfs_file_release,
.fsync = xfs_file_fsync,
......
......@@ -34,6 +34,7 @@
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_inode_item.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
......@@ -1089,6 +1090,10 @@ xfs_file_iomap_begin(
trace_xfs_iomap_found(ip, offset, length, 0, &imap);
}
if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
& ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
xfs_bmbt_to_iomap(ip, iomap, &imap);
if (shared)
......
......@@ -654,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
TRACE_EVENT(xfs_filemap_fault,
TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
bool write_fault),
......
......@@ -96,7 +96,9 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev);
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops);
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
const struct iomap_ops *ops);
pfn_t *pfnp, const struct iomap_ops *ops);
int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
pfn_t pfn);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
......
......@@ -1702,6 +1702,7 @@ struct file_operations {
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
unsigned long mmap_supported_flags;
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
......
......@@ -21,9 +21,13 @@ struct vm_fault;
/*
* Flags for all iomap mappings:
*
* IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
* written data and requires fdatasync to commit them to persistent storage.
*/
#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */
#define IOMAP_F_BOUNDARY 0x02 /* mapping ends at metadata boundary */
#define IOMAP_F_DIRTY 0x04 /* uncommitted metadata */
/*
* Flags that only need to be reported for IOMAP_REPORT requests:
......
......@@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid);
int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_transaction_committed(journal_t *journal, tid_t tid);
int jbd2_complete_transaction(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
......
......@@ -18,6 +18,18 @@
#include <linux/sizes.h>
#include <linux/types.h>
#include <linux/uuid.h>
#include <linux/spinlock.h>
struct badrange_entry {
u64 start;
u64 length;
struct list_head list;
};
struct badrange {
struct list_head list;
spinlock_t lock;
};
enum {
/* when a dimm supports both PMEM and BLK access a label is required */
......@@ -129,9 +141,12 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
}
int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
phys_addr_t start, unsigned int len);
void badrange_init(struct badrange *badrange);
int badrange_add(struct badrange *badrange, u64 addr, u64 length);
void badrange_forget(struct badrange *badrange, phys_addr_t start,
unsigned int len);
int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr,
u64 length);
struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
struct nvdimm_bus_descriptor *nfit_desc);
void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);
......
......@@ -199,6 +199,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
#define VM_SYNC 0x00800000 /* Synchronous page faults */
#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
......@@ -1191,8 +1192,9 @@ static inline void clear_page_pfmemalloc(struct page *page)
#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
#define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables
* and needs fsync() to complete (for
* synchronous page faults in DAX) */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
......@@ -1210,7 +1212,8 @@ static inline void clear_page_pfmemalloc(struct page *page)
{ VM_FAULT_LOCKED, "LOCKED" }, \
{ VM_FAULT_RETRY, "RETRY" }, \
{ VM_FAULT_FALLBACK, "FALLBACK" }, \
{ VM_FAULT_DONE_COW, "DONE_COW" }
{ VM_FAULT_DONE_COW, "DONE_COW" }, \
{ VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
/* Encode hstate index for a hwpoisoned large page */
#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
......
......@@ -8,6 +8,48 @@
#include <linux/atomic.h>
#include <uapi/linux/mman.h>
/*
* Arrange for legacy / undefined architecture specific flags to be
* ignored by mmap handling code.
*/
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB 0
#endif
#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0
#endif
/*
* The historical set of flags that all mmap implementations implicitly
* support when a ->mmap_validate() op is not provided in file_operations.
*/
#define LEGACY_MAP_MASK (MAP_SHARED \
| MAP_PRIVATE \
| MAP_FIXED \
| MAP_ANONYMOUS \
| MAP_DENYWRITE \
| MAP_EXECUTABLE \
| MAP_UNINITIALIZED \
| MAP_GROWSDOWN \
| MAP_LOCKED \
| MAP_NORESERVE \
| MAP_POPULATE \
| MAP_NONBLOCK \
| MAP_STACK \
| MAP_HUGETLB \
| MAP_32BIT \
| MAP_HUGE_2MB \
| MAP_HUGE_1GB)
extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes;
......@@ -64,8 +106,9 @@ static inline bool arch_validate_prot(unsigned long prot)
* ("bit1" and "bit2" must be single bits)
*/
#define _calc_vm_trans(x, bit1, bit2) \
((!(bit1) || !(bit2)) ? 0 : \
((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
: ((x) & (bit1)) / ((bit1) / (bit2)))
: ((x) & (bit1)) / ((bit1) / (bit2))))
/*
* Combine the mmap "prot" argument into "vm_flags" used internally.
......@@ -87,7 +130,8 @@ calc_vm_flag_bits(unsigned long flags)
{
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC );
}
unsigned long vm_commit_limit(void);
......
......@@ -149,7 +149,6 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
TP_ARGS(inode, vmf, length, pfn, radix_entry))
DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
DECLARE_EVENT_CLASS(dax_pte_fault_class,
TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result),
......@@ -192,6 +191,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \
DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
DEFINE_PTE_FAULT_EVENT(dax_load_hole);
DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite_no_entry);
DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite);
TRACE_EVENT(dax_insert_mapping,
TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry),
......
......@@ -17,6 +17,7 @@
#define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
#define MAP_TYPE 0x0f /* Mask for type of mapping */
#define MAP_FIXED 0x10 /* Interpret addr exactly */
#define MAP_ANONYMOUS 0x20 /* don't use a file */
......
......@@ -13,6 +13,7 @@
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
#define MAP_SYNC 0x80000 /* perform synchronous page faults for the mapping */
/* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
......
......@@ -1387,9 +1387,24 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (file) {
struct inode *inode = file_inode(file);
unsigned long flags_mask;
flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
switch (flags & MAP_TYPE) {
case MAP_SHARED:
/*
* Force use of MAP_SHARED_VALIDATE with non-legacy
* flags. E.g. MAP_SYNC is dangerous to use with
* MAP_SHARED as you don't know which consistency model
* you will get. We silently ignore unsupported flags
* with MAP_SHARED to preserve backward compatibility.
*/
flags &= LEGACY_MAP_MASK;
/* fall through */
case MAP_SHARED_VALIDATE:
if (flags & ~flags_mask)
return -EOPNOTSUPP;
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
return -EACCES;
......
......@@ -17,6 +17,7 @@
#define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
#define MAP_TYPE 0x0f /* Mask for type of mapping */
#define MAP_FIXED 0x10 /* Interpret addr exactly */
#define MAP_ANONYMOUS 0x20 /* don't use a file */
......
......@@ -70,6 +70,7 @@ libnvdimm-y += $(NVDIMM_SRC)/region_devs.o
libnvdimm-y += $(NVDIMM_SRC)/region.o
libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o
libnvdimm-y += $(NVDIMM_SRC)/label.o
libnvdimm-y += $(NVDIMM_SRC)/badrange.o
libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o
libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o
libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o
......
This diff is collapsed.
......@@ -32,6 +32,58 @@ struct nfit_test_resource {
void *buf;
};
#define ND_TRANSLATE_SPA_STATUS_INVALID_SPA 2
#define NFIT_ARS_INJECT_INVALID 2
enum err_inj_options {
ND_ARS_ERR_INJ_OPT_NOTIFY = 0,
};
/* nfit commands */
enum nfit_cmd_num {
NFIT_CMD_TRANSLATE_SPA = 5,
NFIT_CMD_ARS_INJECT_SET = 7,
NFIT_CMD_ARS_INJECT_CLEAR = 8,
NFIT_CMD_ARS_INJECT_GET = 9,
};
struct nd_cmd_translate_spa {
__u64 spa;
__u32 status;
__u8 flags;
__u8 _reserved[3];
__u64 translate_length;
__u32 num_nvdimms;
struct nd_nvdimm_device {
__u32 nfit_device_handle;
__u32 _reserved;
__u64 dpa;
} __packed devices[0];
} __packed;
struct nd_cmd_ars_err_inj {
__u64 err_inj_spa_range_base;
__u64 err_inj_spa_range_length;
__u8 err_inj_options;
__u32 status;
} __packed;
struct nd_cmd_ars_err_inj_clr {
__u64 err_inj_clr_spa_range_base;
__u64 err_inj_clr_spa_range_length;
__u32 status;
} __packed;
struct nd_cmd_ars_err_inj_stat {
__u32 status;
__u32 inj_err_rec_count;
struct nd_error_stat_query_record {
__u64 err_inj_stat_spa_range_base;
__u64 err_inj_stat_spa_range_length;
} __packed record[0];
} __packed;
union acpi_object;
typedef void *acpi_handle;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment