Merge tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm and dax updates from Dan Williams: "Save for a few late fixes, all of these commits have shipped in -next releases since before the merge window opened, and 0day has given a build success notification. The ext4 touches came from Jan, and the xfs touches have Darrick's reviewed-by. An xfstest for the MAP_SYNC feature has been through a few round of reviews and is on track to be merged. - Introduce MAP_SYNC and MAP_SHARED_VALIDATE, a mechanism to enable 'userspace flush' of persistent memory updates via filesystem-dax mappings. It arranges for any filesystem metadata updates that may be required to satisfy a write fault to also be flushed ("on disk") before the kernel returns to userspace from the fault handler. Effectively every write-fault that dirties metadata completes an fsync() before returning from the fault handler. The new MAP_SHARED_VALIDATE mapping type guarantees that the MAP_SYNC flag is validated as supported by the filesystem's ->mmap() file operation. - Add support for the standard ACPI 6.2 label access methods that replace the NVDIMM_FAMILY_INTEL (vendor specific) label methods. This enables interoperability with environments that only implement the standardized methods. - Add support for the ACPI 6.2 NVDIMM media error injection methods. - Add support for the NVDIMM_FAMILY_INTEL v1.6 DIMM commands for latch last shutdown status, firmware update, SMART error injection, and SMART alarm threshold control. - Cleanup physical address information disclosures to be root-only. - Fix revalidation of the DIMM "locked label area" status to support dynamic unlock of the label area. - Expand unit test infrastructure to mock the ACPI 6.2 Translate SPA (system-physical-address) command and error injection commands. Acknowledgements that came after the commits were pushed to -next: - 957ac8c4 ("dax: fix PMD faults on zero-length files"): Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> - a39e596b ("xfs: support for synchronous DAX faults") and 7b565c9f ("xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()") Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>" * tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (49 commits) acpi, nfit: add 'Enable Latch System Shutdown Status' command support dax: fix general protection fault in dax_alloc_inode dax: fix PMD faults on zero-length files dax: stop requiring a live device for dax_flush() brd: remove dax support dax: quiet bdev_dax_supported() fs, dax: unify IOMAP_F_DIRTY read vs write handling policy in the dax core tools/testing/nvdimm: unit test clear-error commands acpi, nfit: validate commands against the device type tools/testing/nvdimm: stricter bounds checking for error injection commands xfs: support for synchronous DAX faults xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault() ext4: Support for synchronous DAX faults ext4: Simplify error handling in ext4_dax_huge_fault() dax: Implement dax_finish_sync_fault() dax, iomap: Add support for synchronous faults mm: Define MAP_SYNC and VM_SYNC flags dax: Allow tuning whether dax_insert_mapping_entry() dirties entry dax: Allow dax_iomap_fault() to return pfn dax: Fix comment describing dax_iomap_fault() ...

Merge tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
Pull libnvdimm and dax updates from Dan Williams: "Save for a few late fixes, all of these commits have shipped in -next releases since before the merge window opened, and 0day has given a build success notification. The ext4 touches came from Jan, and the xfs touches have Darrick's reviewed-by. An xfstest for the MAP_SYNC feature has been through a few round of reviews and is on track to be merged. - Introduce MAP_SYNC and MAP_SHARED_VALIDATE, a mechanism to enable 'userspace flush' of persistent memory updates via filesystem-dax mappings. It arranges for any filesystem metadata updates that may be required to satisfy a write fault to also be flushed ("on disk") before the kernel returns to userspace from the fault handler. Effectively every write-fault that dirties metadata completes an fsync() before returning from the fault handler. The new MAP_SHARED_VALIDATE mapping type guarantees that the MAP_SYNC flag is validated as supported by the filesystem's ->mmap() file operation. - Add support for the standard ACPI 6.2 label access methods that replace the NVDIMM_FAMILY_INTEL (vendor specific) label methods. This enables interoperability with environments that only implement the standardized methods. - Add support for the ACPI 6.2 NVDIMM media error injection methods. - Add support for the NVDIMM_FAMILY_INTEL v1.6 DIMM commands for latch last shutdown status, firmware update, SMART error injection, and SMART alarm threshold control. - Cleanup physical address information disclosures to be root-only. - Fix revalidation of the DIMM "locked label area" status to support dynamic unlock of the label area. - Expand unit test infrastructure to mock the ACPI 6.2 Translate SPA (system-physical-address) command and error injection commands. Acknowledgements that came after the commits were pushed to -next: - 957ac8c4 ("dax: fix PMD faults on zero-length files"): Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> - a39e596b ("xfs: support for synchronous DAX faults") and 7b565c9f ("xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()") Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>" * tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (49 commits) acpi, nfit: add 'Enable Latch System Shutdown Status' command support dax: fix general protection fault in dax_alloc_inode dax: fix PMD faults on zero-length files dax: stop requiring a live device for dax_flush() brd: remove dax support dax: quiet bdev_dax_supported() fs, dax: unify IOMAP_F_DIRTY read vs write handling policy in the dax core tools/testing/nvdimm: unit test clear-error commands acpi, nfit: validate commands against the device type tools/testing/nvdimm: stricter bounds checking for error injection commands xfs: support for synchronous DAX faults xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault() ext4: Support for synchronous DAX faults ext4: Simplify error handling in ext4_dax_huge_fault() dax: Implement dax_finish_sync_fault() dax, iomap: Add support for synchronous faults mm: Define MAP_SYNC and VM_SYNC flags dax: Allow tuning whether dax_insert_mapping_entry() dirties entry dax: Allow dax_iomap_fault() to return pfn dax: Fix comment describing dax_iomap_fault() ...
a3841f94 · Linus Torvalds · adeba81a · 4247f24c · a3841f94 · a3841f94
Commit a3841f94 authored Nov 17, 2017 by Linus Torvalds
48 changed files
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4208,7 +4208,7 @@ L:	linux-i2c@vger.kernel.org
 S:	Maintained
 F:	drivers/i2c/busses/i2c-diolan-u2c.c

-DIRECT ACCESS (DAX)
+FILESYSTEM DIRECT ACCESS (DAX)
 M:	Matthew Wilcox <mawilcox@microsoft.com>
 M:	Ross Zwisler <ross.zwisler@linux.intel.com>
 L:	linux-fsdevel@vger.kernel.org
@@ -4217,6 +4217,12 @@ F:	fs/dax.c
 F:	include/linux/dax.h
 F:	include/trace/events/fs_dax.h

+DEVICE DIRECT ACCESS (DAX)
+M:	Dan Williams <dan.j.williams@intel.com>
+L:	linux-nvdimm@lists.01.org
+S:	Supported
+F:	drivers/dax/
+
 DIRECTORY NOTIFICATION (DNOTIFY)
 M:	Jan Kara <jack@suse.cz>
 R:	Amir Goldstein <amir73il@gmail.com>

--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -12,6 +12,7 @@

 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
 #define MAP_TYPE	0x0f		/* Mask for type of mapping (OSF/1 is _wrong_) */
 #define MAP_FIXED	0x100		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x10		/* don't use a file */

--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -29,6 +29,7 @@
 */
 #define MAP_SHARED	0x001		/* Share changes */
 #define MAP_PRIVATE	0x002		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x003	/* share + validate extension flags */
 #define MAP_TYPE	0x00f		/* Mask for type of mapping */
 #define MAP_FIXED	0x010		/* Interpret addr exactly */


--- a/arch/parisc/include/uapi/asm/mman.h
+++ b/arch/parisc/include/uapi/asm/mman.h
@@ -12,6 +12,7 @@

 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
 #define MAP_TYPE	0x03		/* Mask for type of mapping */
 #define MAP_FIXED	0x04		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x10		/* don't use a file */

--- a/arch/xtensa/include/uapi/asm/mman.h
+++ b/arch/xtensa/include/uapi/asm/mman.h
@@ -36,6 +36,7 @@
 */
 #define MAP_SHARED	0x001		/* Share changes */
 #define MAP_PRIVATE	0x002		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x003	/* share + validate extension flags */
 #define MAP_TYPE	0x00f		/* Mask for type of mapping */
 #define MAP_FIXED	0x010		/* Interpret addr exactly */


--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
--- a/drivers/acpi/nfit/mce.c
+++ b/drivers/acpi/nfit/mce.c
@@ -67,7 +67,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
 			continue;

 		/* If this fails due to an -ENOMEM, there is little we can do */
-		nvdimm_bus_add_poison(acpi_desc->nvdimm_bus,
+		nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
 				ALIGN(mce->addr, L1_CACHE_BYTES),
 				L1_CACHE_BYTES);
 		nvdimm_region_notify(nfit_spa->nd_region,

--- a/drivers/acpi/nfit/nfit.h
+++ b/drivers/acpi/nfit/nfit.h
@@ -24,7 +24,7 @@
 /* ACPI 6.1 */
 #define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba"

-/* http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf */
+/* http://pmem.io/documents/NVDIMM_DSM_Interface-V1.6.pdf */
 #define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66"

 /* https://github.com/HewlettPackard/hpe-nvm/blob/master/Documentation/ */
@@ -38,6 +38,37 @@
 		| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
 		| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)

+#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_MSFT
+
+#define NVDIMM_STANDARD_CMDMASK \
+(1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
+ | 1 << ND_CMD_GET_CONFIG_SIZE | 1 << ND_CMD_GET_CONFIG_DATA \
+ | 1 << ND_CMD_SET_CONFIG_DATA | 1 << ND_CMD_VENDOR_EFFECT_LOG_SIZE \
+ | 1 << ND_CMD_VENDOR_EFFECT_LOG | 1 << ND_CMD_VENDOR)
+
+/*
+ * Command numbers that the kernel needs to know about to handle
+ * non-default DSM revision ids
+ */
+enum nvdimm_family_cmds {
+	NVDIMM_INTEL_LATCH_SHUTDOWN = 10,
+	NVDIMM_INTEL_GET_MODES = 11,
+	NVDIMM_INTEL_GET_FWINFO = 12,
+	NVDIMM_INTEL_START_FWUPDATE = 13,
+	NVDIMM_INTEL_SEND_FWUPDATE = 14,
+	NVDIMM_INTEL_FINISH_FWUPDATE = 15,
+	NVDIMM_INTEL_QUERY_FWUPDATE = 16,
+	NVDIMM_INTEL_SET_THRESHOLD = 17,
+	NVDIMM_INTEL_INJECT_ERROR = 18,
+};
+
+#define NVDIMM_INTEL_CMDMASK \
+(NVDIMM_STANDARD_CMDMASK | 1 << NVDIMM_INTEL_GET_MODES \
+ | 1 << NVDIMM_INTEL_GET_FWINFO | 1 << NVDIMM_INTEL_START_FWUPDATE \
+ | 1 << NVDIMM_INTEL_SEND_FWUPDATE | 1 << NVDIMM_INTEL_FINISH_FWUPDATE \
+ | 1 << NVDIMM_INTEL_QUERY_FWUPDATE | 1 << NVDIMM_INTEL_SET_THRESHOLD \
+ | 1 << NVDIMM_INTEL_INJECT_ERROR | 1 << NVDIMM_INTEL_LATCH_SHUTDOWN)
+
 enum nfit_uuids {
 	/* for simplicity alias the uuid index with the family id */
 	NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL,
@@ -140,6 +171,9 @@ struct nfit_mem {
 	struct resource *flush_wpq;
 	unsigned long dsm_mask;
 	int family;
+	u32 has_lsi:1;
+	u32 has_lsr:1;
+	u32 has_lsw:1;
 };

 struct acpi_nfit_desc {
@@ -167,6 +201,7 @@ struct acpi_nfit_desc {
 	unsigned int init_complete:1;
 	unsigned long dimm_cmd_force_en;
 	unsigned long bus_cmd_force_en;
+	unsigned long bus_nfit_cmd_force_en;
 	int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
 			void *iobuf, u64 len, int rw);
 };

--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -302,7 +302,6 @@ config BLK_DEV_SX8

 config BLK_DEV_RAM
 	tristate "RAM block device support"
-	select DAX if BLK_DEV_RAM_DAX
 	---help---
 	  Saying Y here will allow you to use a portion of your RAM memory as
 	  a block device, so that you can make file systems on it, read and
@@ -338,17 +337,6 @@ config BLK_DEV_RAM_SIZE
 	  The default value is 4096 kilobytes. Only change this if you know
 	  what you are doing.

-config BLK_DEV_RAM_DAX
-	bool "Support Direct Access (DAX) to RAM block devices"
-	depends on BLK_DEV_RAM && FS_DAX
-	default n
-	help
-	  Support filesystems using DAX to access RAM block devices.  This
-	  avoids double-buffering data in the page cache before copying it
-	  to the block device.  Answering Y will slightly enlarge the kernel,
-	  and will prevent RAM block device backing store memory from being
-	  allocated from highmem (only a problem for highmem systems).
-
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media (DEPRECATED)"
 	depends on !UML

--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -21,11 +21,6 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/backing-dev.h>
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-#include <linux/pfn_t.h>
-#include <linux/dax.h>
-#include <linux/uio.h>
-#endif

 #include <linux/uaccess.h>

@@ -45,9 +40,6 @@ struct brd_device {

 	struct request_queue	*brd_queue;
 	struct gendisk		*brd_disk;
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	struct dax_device	*dax_dev;
-#endif
 	struct list_head	brd_list;

 	/*
@@ -112,9 +104,6 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
 	 * restriction might be able to be lifted.
 	 */
 	gfp_flags = GFP_NOIO | __GFP_ZERO;
-#ifndef CONFIG_BLK_DEV_RAM_DAX
-	gfp_flags |= __GFP_HIGHMEM;
-#endif
 	page = alloc_page(gfp_flags);
 	if (!page)
 		return NULL;
@@ -334,43 +323,6 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
 	return err;
 }

-#ifdef CONFIG_BLK_DEV_RAM_DAX
-static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff,
-		long nr_pages, void **kaddr, pfn_t *pfn)
-{
-	struct page *page;
-
-	if (!brd)
-		return -ENODEV;
-	page = brd_insert_page(brd, (sector_t)pgoff << PAGE_SECTORS_SHIFT);
-	if (!page)
-		return -ENOSPC;
-	*kaddr = page_address(page);
-	*pfn = page_to_pfn_t(page);
-
-	return 1;
-}
-
-static long brd_dax_direct_access(struct dax_device *dax_dev,
-		pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
-{
-	struct brd_device *brd = dax_get_private(dax_dev);
-
-	return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn);
-}
-
-static size_t brd_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
-		void *addr, size_t bytes, struct iov_iter *i)
-{
-	return copy_from_iter(addr, bytes, i);
-}
-
-static const struct dax_operations brd_dax_ops = {
-	.direct_access = brd_dax_direct_access,
-	.copy_from_iter = brd_dax_copy_from_iter,
-};
-#endif
-
 static const struct block_device_operations brd_fops = {
 	.owner =		THIS_MODULE,
 	.rw_page =		brd_rw_page,
@@ -451,21 +403,8 @@ static struct brd_device *brd_alloc(int i)
 	set_capacity(disk, rd_size * 2);
 	disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;

-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
-	brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops);
-	if (!brd->dax_dev)
-		goto out_free_inode;
-#endif
-
-
 	return brd;

-#ifdef CONFIG_BLK_DEV_RAM_DAX
-out_free_inode:
-	kill_dax(brd->dax_dev);
-	put_dax(brd->dax_dev);
-#endif
 out_free_queue:
 	blk_cleanup_queue(brd->brd_queue);
 out_free_dev:
@@ -505,10 +444,6 @@ static struct brd_device *brd_init_one(int i, bool *new)
 static void brd_del_one(struct brd_device *brd)
 {
 	list_del(&brd->brd_list);
-#ifdef CONFIG_BLK_DEV_RAM_DAX
-	kill_dax(brd->dax_dev);
-	put_dax(brd->dax_dev);
-#endif
 	del_gendisk(brd->brd_disk);
 	brd_free(brd);
 }

--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -222,7 +222,8 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
 		unsigned long size)
 {
 	struct resource *res;
-	phys_addr_t phys;
+	/* gcc-4.6.3-nolibc for i386 complains that this is uninitialized */
+	phys_addr_t uninitialized_var(phys);
 	int i;

 	for (i = 0; i < dev_dax->num_resources; i++) {

--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -92,21 +92,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 	long len;

 	if (blocksize != PAGE_SIZE) {
-		pr_err("VFS (%s): error: unsupported blocksize for dax\n",
+		pr_debug("VFS (%s): error: unsupported blocksize for dax\n",
 				sb->s_id);
 		return -EINVAL;
 	}

 	err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
 	if (err) {
-		pr_err("VFS (%s): error: unaligned partition for dax\n",
+		pr_debug("VFS (%s): error: unaligned partition for dax\n",
 				sb->s_id);
 		return err;
 	}

 	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 	if (!dax_dev) {
-		pr_err("VFS (%s): error: device does not support dax\n",
+		pr_debug("VFS (%s): error: device does not support dax\n",
 				sb->s_id);
 		return -EOPNOTSUPP;
 	}
@@ -118,7 +118,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 	put_dax(dax_dev);

 	if (len < 1) {
-		pr_err("VFS (%s): error: dax access failed (%ld)",
+		pr_debug("VFS (%s): error: dax access failed (%ld)\n",
 				sb->s_id, len);
 		return len < 0 ? len : -EIO;
 	}
@@ -273,9 +273,6 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter);
 void arch_wb_cache_pmem(void *addr, size_t size);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
 {
-	if (unlikely(!dax_alive(dax_dev)))
-		return;
-
 	if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
 		return;

@@ -344,6 +341,9 @@ static struct inode *dax_alloc_inode(struct super_block *sb)
 	struct inode *inode;

 	dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
+	if (!dax_dev)
+		return NULL;
+
 	inode = &dax_dev->inode;
 	inode->i_rdev = 0;
 	return inode;

--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -21,6 +21,7 @@ libnvdimm-y += region_devs.o
 libnvdimm-y += region.o
 libnvdimm-y += namespace_devs.o
 libnvdimm-y += label.o
+libnvdimm-y += badrange.o
 libnvdimm-$(CONFIG_ND_CLAIM) += claim.o
 libnvdimm-$(CONFIG_BTT) += btt_devs.o
 libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o

--- a/drivers/nvdimm/badrange.c
+++ b/drivers/nvdimm/badrange.c
+/*
+ * Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/libnvdimm.h>
+#include <linux/badblocks.h>
+#include <linux/export.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/device.h>
+#include <linux/ctype.h>
+#include <linux/ndctl.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include "nd-core.h"
+#include "nd.h"
+
+void badrange_init(struct badrange *badrange)
+{
+	INIT_LIST_HEAD(&badrange->list);
+	spin_lock_init(&badrange->lock);
+}
+EXPORT_SYMBOL_GPL(badrange_init);
+
+static void append_badrange_entry(struct badrange *badrange,
+		struct badrange_entry *bre, u64 addr, u64 length)
+{
+	lockdep_assert_held(&badrange->lock);
+	bre->start = addr;
+	bre->length = length;
+	list_add_tail(&bre->list, &badrange->list);
+}
+
+static int alloc_and_append_badrange_entry(struct badrange *badrange,
+		u64 addr, u64 length, gfp_t flags)
+{
+	struct badrange_entry *bre;
+
+	bre = kzalloc(sizeof(*bre), flags);
+	if (!bre)
+		return -ENOMEM;
+
+	append_badrange_entry(badrange, bre, addr, length);
+	return 0;
+}
+
+static int add_badrange(struct badrange *badrange, u64 addr, u64 length)
+{
+	struct badrange_entry *bre, *bre_new;
+
+	spin_unlock(&badrange->lock);
+	bre_new = kzalloc(sizeof(*bre_new), GFP_KERNEL);
+	spin_lock(&badrange->lock);
+
+	if (list_empty(&badrange->list)) {
+		if (!bre_new)
+			return -ENOMEM;
+		append_badrange_entry(badrange, bre_new, addr, length);
+		return 0;
+	}
+
+	/*
+	 * There is a chance this is a duplicate, check for those first.
+	 * This will be the common case as ARS_STATUS returns all known
+	 * errors in the SPA space, and we can't query it per region
+	 */
+	list_for_each_entry(bre, &badrange->list, list)
+		if (bre->start == addr) {
+			/* If length has changed, update this list entry */
+			if (bre->length != length)
+				bre->length = length;
+			kfree(bre_new);
+			return 0;
+		}
+
+	/*
+	 * If not a duplicate or a simple length update, add the entry as is,
+	 * as any overlapping ranges will get resolved when the list is consumed
+	 * and converted to badblocks
+	 */
+	if (!bre_new)
+		return -ENOMEM;
+	append_badrange_entry(badrange, bre_new, addr, length);
+
+	return 0;
+}
+
+int badrange_add(struct badrange *badrange, u64 addr, u64 length)
+{
+	int rc;
+
+	spin_lock(&badrange->lock);
+	rc = add_badrange(badrange, addr, length);
+	spin_unlock(&badrange->lock);
+
+	return rc;
+}
+EXPORT_SYMBOL_GPL(badrange_add);
+
+void badrange_forget(struct badrange *badrange, phys_addr_t start,
+		unsigned int len)
+{
+	struct list_head *badrange_list = &badrange->list;
+	u64 clr_end = start + len - 1;
+	struct badrange_entry *bre, *next;
+
+	spin_lock(&badrange->lock);
+
+	/*
+	 * [start, clr_end] is the badrange interval being cleared.
+	 * [bre->start, bre_end] is the badrange_list entry we're comparing
+	 * the above interval against. The badrange list entry may need
+	 * to be modified (update either start or length), deleted, or
+	 * split into two based on the overlap characteristics
+	 */
+
+	list_for_each_entry_safe(bre, next, badrange_list, list) {
+		u64 bre_end = bre->start + bre->length - 1;
+
+		/* Skip intervals with no intersection */
+		if (bre_end < start)
+			continue;
+		if (bre->start >  clr_end)
+			continue;
+		/* Delete completely overlapped badrange entries */
+		if ((bre->start >= start) && (bre_end <= clr_end)) {
+			list_del(&bre->list);
+			kfree(bre);
+			continue;
+		}
+		/* Adjust start point of partially cleared entries */
+		if ((start <= bre->start) && (clr_end > bre->start)) {
+			bre->length -= clr_end - bre->start + 1;
+			bre->start = clr_end + 1;
+			continue;
+		}
+		/* Adjust bre->length for partial clearing at the tail end */
+		if ((bre->start < start) && (bre_end <= clr_end)) {
+			/* bre->start remains the same */
+			bre->length = start - bre->start;
+			continue;
+		}
+		/*
+		 * If clearing in the middle of an entry, we split it into
+		 * two by modifying the current entry to represent one half of
+		 * the split, and adding a new entry for the second half.
+		 */
+		if ((bre->start < start) && (bre_end > clr_end)) {
+			u64 new_start = clr_end + 1;
+			u64 new_len = bre_end - new_start + 1;
+
+			/* Add new entry covering the right half */
+			alloc_and_append_badrange_entry(badrange, new_start,
+					new_len, GFP_NOWAIT);
+			/* Adjust this entry to cover the left half */
+			bre->length = start - bre->start;
+			continue;
+		}
+	}
+	spin_unlock(&badrange->lock);
+}
+EXPORT_SYMBOL_GPL(badrange_forget);
+
+static void set_badblock(struct badblocks *bb, sector_t s, int num)
+{
+	dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n",
+			(u64) s * 512, (u64) num * 512);
+	/* this isn't an error as the hardware will still throw an exception */
+	if (badblocks_set(bb, s, num, 1))
+		dev_info_once(bb->dev, "%s: failed for sector %llx\n",
+				__func__, (u64) s);
+}
+
+/**
+ * __add_badblock_range() - Convert a physical address range to bad sectors
+ * @bb:		badblocks instance to populate
+ * @ns_offset:	namespace offset where the error range begins (in bytes)
+ * @len:	number of bytes of badrange to be added
+ *
+ * This assumes that the range provided with (ns_offset, len) is within
+ * the bounds of physical addresses for this namespace, i.e. lies in the
+ * interval [ns_start, ns_start + ns_size)
+ */
+static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
+{
+	const unsigned int sector_size = 512;
+	sector_t start_sector, end_sector;
+	u64 num_sectors;
+	u32 rem;
+
+	start_sector = div_u64(ns_offset, sector_size);
+	end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
+	if (rem)
+		end_sector++;
+	num_sectors = end_sector - start_sector;
+
+	if (unlikely(num_sectors > (u64)INT_MAX)) {
+		u64 remaining = num_sectors;
+		sector_t s = start_sector;
+
+		while (remaining) {
+			int done = min_t(u64, remaining, INT_MAX);
+
+			set_badblock(bb, s, done);
+			remaining -= done;
+			s += done;
+		}
+	} else
+		set_badblock(bb, start_sector, num_sectors);
+}
+
+static void badblocks_populate(struct badrange *badrange,
+		struct badblocks *bb, const struct resource *res)
+{
+	struct badrange_entry *bre;
+
+	if (list_empty(&badrange->list))
+		return;
+
+	list_for_each_entry(bre, &badrange->list, list) {
+		u64 bre_end = bre->start + bre->length - 1;
+
+		/* Discard intervals with no intersection */
+		if (bre_end < res->start)
+			continue;
+		if (bre->start >  res->end)
+			continue;
+		/* Deal with any overlap after start of the namespace */
+		if (bre->start >= res->start) {
+			u64 start = bre->start;
+			u64 len;
+
+			if (bre_end <= res->end)
+				len = bre->length;
+			else
+				len = res->start + resource_size(res)
+					- bre->start;
+			__add_badblock_range(bb, start - res->start, len);
+			continue;
+		}
+		/*
+		 * Deal with overlap for badrange starting before
+		 * the namespace.
+		 */
+		if (bre->start < res->start) {
+			u64 len;
+
+			if (bre_end < res->end)
+				len = bre->start + bre->length - res->start;
+			else
+				len = resource_size(res);
+			__add_badblock_range(bb, 0, len);
+		}
+	}
+}
+
+/**
+ * nvdimm_badblocks_populate() - Convert a list of badranges to badblocks
+ * @region: parent region of the range to interrogate
+ * @bb: badblocks instance to populate
+ * @res: resource range to consider
+ *
+ * The badrange list generated during bus initialization may contain
+ * multiple, possibly overlapping physical address ranges.  Compare each
+ * of these ranges to the resource range currently being initialized,
+ * and add badblocks entries for all matching sub-ranges
+ */
+void nvdimm_badblocks_populate(struct nd_region *nd_region,
+		struct badblocks *bb, const struct resource *res)
+{
+	struct nvdimm_bus *nvdimm_bus;
+
+	if (!is_memory(&nd_region->dev)) {
+		dev_WARN_ONCE(&nd_region->dev, 1,
+				"%s only valid for pmem regions\n", __func__);
+		return;
+	}
+	nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
+
+	nvdimm_bus_lock(&nvdimm_bus->dev);
+	badblocks_populate(&nvdimm_bus->badrange, bb, res);
+	nvdimm_bus_unlock(&nvdimm_bus->dev);
+}
+EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -11,6 +11,7 @@
 * General Public License for more details.
 */
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/libnvdimm.h>
 #include <linux/sched/mm.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
@@ -221,7 +222,7 @@ static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus,
 		phys_addr_t phys, u64 cleared)
 {
 	if (cleared > 0)
-		nvdimm_forget_poison(nvdimm_bus, phys, cleared);
+		badrange_forget(&nvdimm_bus->badrange, phys, cleared);

 	if (cleared > 0 && cleared / 512)
 		nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared);
@@ -344,11 +345,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		return NULL;
 	INIT_LIST_HEAD(&nvdimm_bus->list);
 	INIT_LIST_HEAD(&nvdimm_bus->mapping_list);
-	INIT_LIST_HEAD(&nvdimm_bus->poison_list);
 	init_waitqueue_head(&nvdimm_bus->probe_wait);
 	nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
 	mutex_init(&nvdimm_bus->reconfig_mutex);
-	spin_lock_init(&nvdimm_bus->poison_lock);
+	badrange_init(&nvdimm_bus->badrange);
 	if (nvdimm_bus->id < 0) {
 		kfree(nvdimm_bus);
 		return NULL;
@@ -395,15 +395,15 @@ static int child_unregister(struct device *dev, void *data)
 	return 0;
 }

-static void free_poison_list(struct list_head *poison_list)
+static void free_badrange_list(struct list_head *badrange_list)
 {
-	struct nd_poison *pl, *next;
+	struct badrange_entry *bre, *next;

-	list_for_each_entry_safe(pl, next, poison_list, list) {
-		list_del(&pl->list);
-		kfree(pl);
+	list_for_each_entry_safe(bre, next, badrange_list, list) {
+		list_del(&bre->list);
+		kfree(bre);
 	}
-	list_del_init(poison_list);
+	list_del_init(badrange_list);
 }

 static int nd_bus_remove(struct device *dev)
@@ -417,9 +417,9 @@ static int nd_bus_remove(struct device *dev)
 	nd_synchronize();
 	device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);

-	spin_lock(&nvdimm_bus->poison_lock);
-	free_poison_list(&nvdimm_bus->poison_list);
-	spin_unlock(&nvdimm_bus->poison_lock);
+	spin_lock(&nvdimm_bus->badrange.lock);
+	free_badrange_list(&nvdimm_bus->badrange.list);
+	spin_unlock(&nvdimm_bus->badrange.lock);

 	nvdimm_bus_destroy_ndctl(nvdimm_bus);


--- a/drivers/nvdimm/core.c
+++ b/drivers/nvdimm/core.c
@@ -398,265 +398,11 @@ struct attribute_group nvdimm_bus_attribute_group = {
 };
 EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);

-static void set_badblock(struct badblocks *bb, sector_t s, int num)
+int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
 {
-	dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n",
-			(u64) s * 512, (u64) num * 512);
-	/* this isn't an error as the hardware will still throw an exception */
-	if (badblocks_set(bb, s, num, 1))
-		dev_info_once(bb->dev, "%s: failed for sector %llx\n",
-				__func__, (u64) s);
+	return badrange_add(&nvdimm_bus->badrange, addr, length);
 }
-
-/**
- * __add_badblock_range() - Convert a physical address range to bad sectors
- * @bb:		badblocks instance to populate
- * @ns_offset:	namespace offset where the error range begins (in bytes)
- * @len:	number of bytes of poison to be added
- *
- * This assumes that the range provided with (ns_offset, len) is within
- * the bounds of physical addresses for this namespace, i.e. lies in the
- * interval [ns_start, ns_start + ns_size)
- */
-static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
-{
-	const unsigned int sector_size = 512;
-	sector_t start_sector, end_sector;
-	u64 num_sectors;
-	u32 rem;
-
-	start_sector = div_u64(ns_offset, sector_size);
-	end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
-	if (rem)
-		end_sector++;
-	num_sectors = end_sector - start_sector;
-
-	if (unlikely(num_sectors > (u64)INT_MAX)) {
-		u64 remaining = num_sectors;
-		sector_t s = start_sector;
-
-		while (remaining) {
-			int done = min_t(u64, remaining, INT_MAX);
-
-			set_badblock(bb, s, done);
-			remaining -= done;
-			s += done;
-		}
-	} else
-		set_badblock(bb, start_sector, num_sectors);
-}
-
-static void badblocks_populate(struct list_head *poison_list,
-		struct badblocks *bb, const struct resource *res)
-{
-	struct nd_poison *pl;
-
-	if (list_empty(poison_list))
-		return;
-
-	list_for_each_entry(pl, poison_list, list) {
-		u64 pl_end = pl->start + pl->length - 1;
-
-		/* Discard intervals with no intersection */
-		if (pl_end < res->start)
-			continue;
-		if (pl->start >  res->end)
-			continue;
-		/* Deal with any overlap after start of the namespace */
-		if (pl->start >= res->start) {
-			u64 start = pl->start;
-			u64 len;
-
-			if (pl_end <= res->end)
-				len = pl->length;
-			else
-				len = res->start + resource_size(res)
-					- pl->start;
-			__add_badblock_range(bb, start - res->start, len);
-			continue;
-		}
-		/* Deal with overlap for poison starting before the namespace */
-		if (pl->start < res->start) {
-			u64 len;
-
-			if (pl_end < res->end)
-				len = pl->start + pl->length - res->start;
-			else
-				len = resource_size(res);
-			__add_badblock_range(bb, 0, len);
-		}
-	}
-}
-
-/**
- * nvdimm_badblocks_populate() - Convert a list of poison ranges to badblocks
- * @region: parent region of the range to interrogate
- * @bb: badblocks instance to populate
- * @res: resource range to consider
- *
- * The poison list generated during bus initialization may contain
- * multiple, possibly overlapping physical address ranges.  Compare each
- * of these ranges to the resource range currently being initialized,
- * and add badblocks entries for all matching sub-ranges
- */
-void nvdimm_badblocks_populate(struct nd_region *nd_region,
-		struct badblocks *bb, const struct resource *res)
-{
-	struct nvdimm_bus *nvdimm_bus;
-	struct list_head *poison_list;
-
-	if (!is_memory(&nd_region->dev)) {
-		dev_WARN_ONCE(&nd_region->dev, 1,
-				"%s only valid for pmem regions\n", __func__);
-		return;
-	}
-	nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
-	poison_list = &nvdimm_bus->poison_list;
-
-	nvdimm_bus_lock(&nvdimm_bus->dev);
-	badblocks_populate(poison_list, bb, res);
-	nvdimm_bus_unlock(&nvdimm_bus->dev);
-}
-EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
-
-static void append_poison_entry(struct nvdimm_bus *nvdimm_bus,
-		struct nd_poison *pl, u64 addr, u64 length)
-{
-	lockdep_assert_held(&nvdimm_bus->poison_lock);
-	pl->start = addr;
-	pl->length = length;
-	list_add_tail(&pl->list, &nvdimm_bus->poison_list);
-}
-
-static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
-			gfp_t flags)
-{
-	struct nd_poison *pl;
-
-	pl = kzalloc(sizeof(*pl), flags);
-	if (!pl)
-		return -ENOMEM;
-
-	append_poison_entry(nvdimm_bus, pl, addr, length);
-	return 0;
-}
-
-static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
-{
-	struct nd_poison *pl, *pl_new;
-
-	spin_unlock(&nvdimm_bus->poison_lock);
-	pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL);
-	spin_lock(&nvdimm_bus->poison_lock);
-
-	if (list_empty(&nvdimm_bus->poison_list)) {
-		if (!pl_new)
-			return -ENOMEM;
-		append_poison_entry(nvdimm_bus, pl_new, addr, length);
-		return 0;
-	}
-
-	/*
-	 * There is a chance this is a duplicate, check for those first.
-	 * This will be the common case as ARS_STATUS returns all known
-	 * errors in the SPA space, and we can't query it per region
-	 */
-	list_for_each_entry(pl, &nvdimm_bus->poison_list, list)
-		if (pl->start == addr) {
-			/* If length has changed, update this list entry */
-			if (pl->length != length)
-				pl->length = length;
-			kfree(pl_new);
-			return 0;
-		}
-
-	/*
-	 * If not a duplicate or a simple length update, add the entry as is,
-	 * as any overlapping ranges will get resolved when the list is consumed
-	 * and converted to badblocks
-	 */
-	if (!pl_new)
-		return -ENOMEM;
-	append_poison_entry(nvdimm_bus, pl_new, addr, length);
-
-	return 0;
-}
-
-int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
-{
-	int rc;
-
-	spin_lock(&nvdimm_bus->poison_lock);
-	rc = bus_add_poison(nvdimm_bus, addr, length);
-	spin_unlock(&nvdimm_bus->poison_lock);
-
-	return rc;
-}
-EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
-
-void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
-		unsigned int len)
-{
-	struct list_head *poison_list = &nvdimm_bus->poison_list;
-	u64 clr_end = start + len - 1;
-	struct nd_poison *pl, *next;
-
-	spin_lock(&nvdimm_bus->poison_lock);
-	WARN_ON_ONCE(list_empty(poison_list));
-
-	/*
-	 * [start, clr_end] is the poison interval being cleared.
-	 * [pl->start, pl_end] is the poison_list entry we're comparing
-	 * the above interval against. The poison list entry may need
-	 * to be modified (update either start or length), deleted, or
-	 * split into two based on the overlap characteristics
-	 */
-
-	list_for_each_entry_safe(pl, next, poison_list, list) {
-		u64 pl_end = pl->start + pl->length - 1;
-
-		/* Skip intervals with no intersection */
-		if (pl_end < start)
-			continue;
-		if (pl->start >  clr_end)
-			continue;
-		/* Delete completely overlapped poison entries */
-		if ((pl->start >= start) && (pl_end <= clr_end)) {
-			list_del(&pl->list);
-			kfree(pl);
-			continue;
-		}
-		/* Adjust start point of partially cleared entries */
-		if ((start <= pl->start) && (clr_end > pl->start)) {
-			pl->length -= clr_end - pl->start + 1;
-			pl->start = clr_end + 1;
-			continue;
-		}
-		/* Adjust pl->length for partial clearing at the tail end */
-		if ((pl->start < start) && (pl_end <= clr_end)) {
-			/* pl->start remains the same */
-			pl->length = start - pl->start;
-			continue;
-		}
-		/*
-		 * If clearing in the middle of an entry, we split it into
-		 * two by modifying the current entry to represent one half of
-		 * the split, and adding a new entry for the second half.
-		 */
-		if ((pl->start < start) && (pl_end > clr_end)) {
-			u64 new_start = clr_end + 1;
-			u64 new_len = pl_end - new_start + 1;
-
-			/* Add new entry covering the right half */
-			add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT);
-			/* Adjust this entry to cover the left half */
-			pl->length = start - pl->start;
-			continue;
-		}
-	}
-	spin_unlock(&nvdimm_bus->poison_lock);
-}
-EXPORT_SYMBOL_GPL(nvdimm_forget_poison);
+EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange);

 #ifdef CONFIG_BLK_DEV_INTEGRITY
 int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)

--- a/drivers/nvdimm/dimm.c
+++ b/drivers/nvdimm/dimm.c
@@ -55,6 +55,8 @@ static int nvdimm_probe(struct device *dev)
 		goto err;

 	rc = nvdimm_init_config_data(ndd);
+	if (rc == -EACCES)
+		nvdimm_set_locked(dev);
 	if (rc)
 		goto err;

@@ -68,6 +70,7 @@ static int nvdimm_probe(struct device *dev)
 	rc = nd_label_reserve_dpa(ndd);
 	if (ndd->ns_current >= 0)
 		nvdimm_set_aliasing(dev);
+	nvdimm_clear_locked(dev);
 	nvdimm_bus_unlock(dev);

 	if (rc)

--- a/drivers/nvdimm/dimm_devs.c
+++ b/drivers/nvdimm/dimm_devs.c
@@ -200,6 +200,13 @@ void nvdimm_set_locked(struct device *dev)
 	set_bit(NDD_LOCKED, &nvdimm->flags);
 }

+void nvdimm_clear_locked(struct device *dev)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+
+	clear_bit(NDD_LOCKED, &nvdimm->flags);
+}
+
 static void nvdimm_release(struct device *dev)
 {
 	struct nvdimm *nvdimm = to_nvdimm(dev);
@@ -324,6 +331,17 @@ static ssize_t commands_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(commands);

+static ssize_t flags_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvdimm *nvdimm = to_nvdimm(dev);
+
+	return sprintf(buf, "%s%s\n",
+			test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "",
+			test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : "");
+}
+static DEVICE_ATTR_RO(flags);
+
 static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 		char *buf)
 {
@@ -365,6 +383,7 @@ static DEVICE_ATTR_RO(available_slots);

 static struct attribute *nvdimm_attributes[] = {
 	&dev_attr_state.attr,
+	&dev_attr_flags.attr,
 	&dev_attr_commands.attr,
 	&dev_attr_available_slots.attr,
 	NULL,

--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -1050,7 +1050,7 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
 	nsindex = to_namespace_index(ndd, 0);
 	memset(nsindex, 0, ndd->nsarea.config_size);
 	for (i = 0; i < 2; i++) {
-		int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT);
+		int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT);

 		if (rc)
 			return rc;

--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1620,7 +1620,7 @@ static umode_t namespace_visible(struct kobject *kobj,
 	if (a == &dev_attr_resource.attr) {
 		if (is_namespace_blk(dev))
 			return 0;
-		return a->mode;
+		return 0400;
 	}

 	if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
@@ -1875,7 +1875,7 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
 * @nspm: target namespace to create
 * @nd_label: target pmem namespace label to evaluate
 */
-struct device *create_namespace_pmem(struct nd_region *nd_region,
+static struct device *create_namespace_pmem(struct nd_region *nd_region,
 		struct nd_namespace_index *nsindex,
 		struct nd_namespace_label *nd_label)
 {
@@ -2186,7 +2186,7 @@ static int add_namespace_resource(struct nd_region *nd_region,
 	return i;
 }

-struct device *create_namespace_blk(struct nd_region *nd_region,
+static struct device *create_namespace_blk(struct nd_region *nd_region,
 		struct nd_namespace_label *nd_label, int count)
 {


--- a/drivers/nvdimm/nd-core.h
+++ b/drivers/nvdimm/nd-core.h
@@ -29,10 +29,9 @@ struct nvdimm_bus {
 	struct list_head list;
 	struct device dev;
 	int id, probe_active;
-	struct list_head poison_list;
 	struct list_head mapping_list;
 	struct mutex reconfig_mutex;
-	spinlock_t poison_lock;
+	struct badrange badrange;
 };

 struct nvdimm {

--- a/drivers/nvdimm/nd.h
+++ b/drivers/nvdimm/nd.h
@@ -34,12 +34,6 @@ enum {
 	NVDIMM_IO_ATOMIC = 1,
 };

-struct nd_poison {
-	u64 start;
-	u64 length;
-	struct list_head list;
-};
-
 struct nvdimm_drvdata {
 	struct device *dev;
 	int nslabel_size;
@@ -254,6 +248,7 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
 		unsigned int len);
 void nvdimm_set_aliasing(struct device *dev);
 void nvdimm_set_locked(struct device *dev);
+void nvdimm_clear_locked(struct device *dev);
 struct nd_btt *to_nd_btt(struct device *dev);

 struct nd_gen_sb {

--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -282,8 +282,16 @@ static struct attribute *nd_pfn_attributes[] = {
 	NULL,
 };

+static umode_t pfn_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	if (a == &dev_attr_resource.attr)
+		return 0400;
+	return a->mode;
+}
+
 struct attribute_group nd_pfn_attribute_group = {
 	.attrs = nd_pfn_attributes,
+	.is_visible = pfn_visible,
 };

 static const struct attribute_group *nd_pfn_attribute_groups[] = {

--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -562,8 +562,12 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
 	if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
 		return 0;

-	if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr)
-		return 0;
+	if (a == &dev_attr_resource.attr) {
+		if (is_nd_pmem(dev))
+			return 0400;
+		else
+			return 0;
+	}

 	if (a == &dev_attr_deep_flush.attr) {
 		int has_flush = nvdimm_has_flush(nd_region);

--- a/fs/dax.c
+++ b/fs/dax.c
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf)
 	}
 	down_read(&ei->dax_sem);

-	ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops);
+	ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops);

 	up_read(&ei->dax_sem);
 	if (vmf->flags & FAULT_FLAG_WRITE)

--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -28,6 +28,7 @@
 #include <linux/quotaops.h>
 #include <linux/pagevec.h>
 #include <linux/uio.h>
+#include <linux/mman.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -297,6 +298,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
 	 */
 	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
 		(vmf->vma->vm_flags & VM_SHARED);
+	pfn_t pfn;

 	if (write) {
 		sb_start_pagefault(sb);
@@ -304,16 +306,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
 		down_read(&EXT4_I(inode)->i_mmap_sem);
 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
 					       EXT4_DATA_TRANS_BLOCKS(sb));
+		if (IS_ERR(handle)) {
+			up_read(&EXT4_I(inode)->i_mmap_sem);
+			sb_end_pagefault(sb);
+			return VM_FAULT_SIGBUS;
+		}
 	} else {
 		down_read(&EXT4_I(inode)->i_mmap_sem);
 	}
-	if (!IS_ERR(handle))
-		result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
-	else
-		result = VM_FAULT_SIGBUS;
+	result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops);
 	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
+		ext4_journal_stop(handle);
+		/* Handling synchronous page fault? */
+		if (result & VM_FAULT_NEEDDSYNC)
+			result = dax_finish_sync_fault(vmf, pe_size, pfn);
 		up_read(&EXT4_I(inode)->i_mmap_sem);
 		sb_end_pagefault(sb);
 	} else {
@@ -351,6 +357,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
 		return -EIO;

+	/*
+	 * We don't support synchronous mappings for non-DAX files. At least
+	 * until someone comes with a sensible use case.
+	 */
+	if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
+		return -EOPNOTSUPP;
+
 	file_accessed(file);
 	if (IS_DAX(file_inode(file))) {
 		vma->vm_ops = &ext4_dax_vm_ops;
@@ -469,6 +482,7 @@ const struct file_operations ext4_file_operations = {
 	.compat_ioctl	= ext4_compat_ioctl,
 #endif
 	.mmap		= ext4_file_mmap,
+	.mmap_supported_flags = MAP_SYNC,
 	.open		= ext4_file_open,
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,

--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3384,6 +3384,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 		return try_to_free_buffers(page);
 }

+static bool ext4_inode_datasync_dirty(struct inode *inode)
+{
+	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
+	if (journal)
+		return !jbd2_transaction_committed(journal,
+					EXT4_I(inode)->i_datasync_tid);
+	/* Any metadata buffers to write? */
+	if (!list_empty(&inode->i_mapping->private_list))
+		return true;
+	return inode->i_state & I_DIRTY_DATASYNC;
+}
+
 static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 			    unsigned flags, struct iomap *iomap)
 {
@@ -3497,6 +3510,8 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 	}

 	iomap->flags = 0;
+	if (ext4_inode_datasync_dirty(inode))
+		iomap->flags |= IOMAP_F_DIRTY;
 	iomap->bdev = inode->i_sb->s_bdev;
 	iomap->dax_dev = sbi->s_daxdev;
 	iomap->offset = first_block << blkbits;

--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -737,6 +737,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	return err;
 }

+/* Return 1 when transaction with given tid has already committed. */
+int jbd2_transaction_committed(journal_t *journal, tid_t tid)
+{
+	int ret = 1;
+
+	read_lock(&journal->j_state_lock);
+	if (journal->j_running_transaction &&
+	    journal->j_running_transaction->t_tid == tid)
+		ret = 0;
+	if (journal->j_committing_transaction &&
+	    journal->j_committing_transaction->t_tid == tid)
+		ret = 0;
+	read_unlock(&journal->j_state_lock);
+	return ret;
+}
+EXPORT_SYMBOL(jbd2_transaction_committed);
+
 /*
 * When this function returns the transaction corresponding to tid
 * will be completed.  If the transaction has currently running, start

--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -661,6 +661,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_ACCOUNT)]	= "ac",
 		[ilog2(VM_NORESERVE)]	= "nr",
 		[ilog2(VM_HUGETLB)]	= "ht",
+		[ilog2(VM_SYNC)]	= "sf",
 		[ilog2(VM_ARCH_1)]	= "ar",
 		[ilog2(VM_WIPEONFORK)]	= "wf",
 		[ilog2(VM_DONTDUMP)]	= "dd",

--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -44,6 +44,7 @@
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
 #include <linux/backing-dev.h>
+#include <linux/mman.h>

 static const struct vm_operations_struct xfs_file_vm_ops;

@@ -1045,7 +1046,11 @@ __xfs_filemap_fault(

 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 	if (IS_DAX(inode)) {
-		ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops);
+		pfn_t pfn;
+
+		ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops);
+		if (ret & VM_FAULT_NEEDDSYNC)
+			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
 	} else {
 		if (write_fault)
 			ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
@@ -1090,37 +1095,16 @@ xfs_filemap_page_mkwrite(
 }

 /*
- * pfn_mkwrite was originally inteneded to ensure we capture time stamp
- * updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
- * to ensure we serialise the fault barrier in place.
+ * pfn_mkwrite was originally intended to ensure we capture time stamp updates
+ * on write faults. In reality, it needs to serialise against truncate and
+ * prepare memory for writing so handle is as standard write fault.
 */
 static int
 xfs_filemap_pfn_mkwrite(
 	struct vm_fault		*vmf)
 {

-	struct inode		*inode = file_inode(vmf->vma->vm_file);
-	struct xfs_inode	*ip = XFS_I(inode);
-	int			ret = VM_FAULT_NOPAGE;
-	loff_t			size;
-
-	trace_xfs_filemap_pfn_mkwrite(ip);
-
-	sb_start_pagefault(inode->i_sb);
-	file_update_time(vmf->vma->vm_file);
-
-	/* check if the faulting page hasn't raced with truncate */
-	xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (vmf->pgoff >= size)
-		ret = VM_FAULT_SIGBUS;
-	else if (IS_DAX(inode))
-		ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
-	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
-	sb_end_pagefault(inode->i_sb);
-	return ret;
-
+	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
 }

 static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1136,6 +1120,13 @@ xfs_file_mmap(
 	struct file	*filp,
 	struct vm_area_struct *vma)
 {
+	/*
+	 * We don't support synchronous mappings for non-DAX files. At least
+	 * until someone comes with a sensible use case.
+	 */
+	if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
+		return -EOPNOTSUPP;
+
 	file_accessed(filp);
 	vma->vm_ops = &xfs_file_vm_ops;
 	if (IS_DAX(file_inode(filp)))
@@ -1154,6 +1145,7 @@ const struct file_operations xfs_file_operations = {
 	.compat_ioctl	= xfs_file_compat_ioctl,
 #endif
 	.mmap		= xfs_file_mmap,
+	.mmap_supported_flags = MAP_SYNC,
 	.open		= xfs_file_open,
 	.release	= xfs_file_release,
 	.fsync		= xfs_file_fsync,

--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -34,6 +34,7 @@
 #include "xfs_error.h"
 #include "xfs_trans.h"
 #include "xfs_trans_space.h"
+#include "xfs_inode_item.h"
 #include "xfs_iomap.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
@@ -1089,6 +1090,10 @@ xfs_file_iomap_begin(
 		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
 	}

+	if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
+				& ~XFS_ILOG_TIMESTAMP))
+		iomap->flags |= IOMAP_F_DIRTY;
+
 	xfs_bmbt_to_iomap(ip, iomap, &imap);

 	if (shared)

--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -654,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);

-DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
-
 TRACE_EVENT(xfs_filemap_fault,
 	TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
 		 bool write_fault),

--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -96,7 +96,9 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev);
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-		    const struct iomap_ops *ops);
+		    pfn_t *pfnp, const struct iomap_ops *ops);
+int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
+			  pfn_t pfn);
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
 				      pgoff_t index);

--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1702,6 +1702,7 @@ struct file_operations {
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
+	unsigned long mmap_supported_flags;
 	int (*open) (struct inode *, struct file *);
 	int (*flush) (struct file *, fl_owner_t id);
 	int (*release) (struct inode *, struct file *);

--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -21,9 +21,13 @@ struct vm_fault;

 /*
 * Flags for all iomap mappings:
+ *
+ * IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
+ * written data and requires fdatasync to commit them to persistent storage.
 */
 #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
 #define IOMAP_F_BOUNDARY	0x02	/* mapping ends at metadata boundary */
+#define IOMAP_F_DIRTY		0x04	/* uncommitted metadata */

 /*
 * Flags that only need to be reported for IOMAP_REPORT requests:

--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
 int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
 int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
+int jbd2_transaction_committed(journal_t *journal, tid_t tid);
 int jbd2_complete_transaction(journal_t *journal, tid_t tid);
 int jbd2_log_do_checkpoint(journal_t *journal);
 int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);

--- a/include/linux/libnvdimm.h
+++ b/include/linux/libnvdimm.h
@@ -18,6 +18,18 @@
 #include <linux/sizes.h>
 #include <linux/types.h>
 #include <linux/uuid.h>
+#include <linux/spinlock.h>
+
+struct badrange_entry {
+	u64 start;
+	u64 length;
+	struct list_head list;
+};
+
+struct badrange {
+	struct list_head list;
+	spinlock_t lock;
+};

 enum {
 	/* when a dimm supports both PMEM and BLK access a label is required */
@@ -129,9 +141,12 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(

 }

-int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length);
-void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus,
-		phys_addr_t start, unsigned int len);
+void badrange_init(struct badrange *badrange);
+int badrange_add(struct badrange *badrange, u64 addr, u64 length);
+void badrange_forget(struct badrange *badrange, phys_addr_t start,
+		unsigned int len);
+int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr,
+		u64 length);
 struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
 		struct nvdimm_bus_descriptor *nfit_desc);
 void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);

--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -199,6 +199,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
+#define VM_SYNC		0x00800000	/* Synchronous page faults */
 #define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
 #define VM_WIPEONFORK	0x02000000	/* Wipe VMA contents in child. */
 #define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
@@ -1191,8 +1192,9 @@ static inline void clear_page_pfmemalloc(struct page *page)
 #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
 #define VM_FAULT_FALLBACK 0x0800	/* huge page fault failed, fall back to small */
 #define VM_FAULT_DONE_COW   0x1000	/* ->fault has fully handled COW */
-
-#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
+#define VM_FAULT_NEEDDSYNC  0x2000	/* ->fault did not modify page tables
+					 * and needs fsync() to complete (for
+					 * synchronous page faults in DAX) */

 #define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
 			 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
@@ -1210,7 +1212,8 @@ static inline void clear_page_pfmemalloc(struct page *page)
 	{ VM_FAULT_LOCKED,		"LOCKED" }, \
 	{ VM_FAULT_RETRY,		"RETRY" }, \
 	{ VM_FAULT_FALLBACK,		"FALLBACK" }, \
-	{ VM_FAULT_DONE_COW,		"DONE_COW" }
+	{ VM_FAULT_DONE_COW,		"DONE_COW" }, \
+	{ VM_FAULT_NEEDDSYNC,		"NEEDDSYNC" }

 /* Encode hstate index for a hwpoisoned large page */
 #define VM_FAULT_SET_HINDEX(x) ((x) << 12)

--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -8,6 +8,48 @@
 #include <linux/atomic.h>
 #include <uapi/linux/mman.h>

+/*
+ * Arrange for legacy / undefined architecture specific flags to be
+ * ignored by mmap handling code.
+ */
+#ifndef MAP_32BIT
+#define MAP_32BIT 0
+#endif
+#ifndef MAP_HUGE_2MB
+#define MAP_HUGE_2MB 0
+#endif
+#ifndef MAP_HUGE_1GB
+#define MAP_HUGE_1GB 0
+#endif
+#ifndef MAP_UNINITIALIZED
+#define MAP_UNINITIALIZED 0
+#endif
+#ifndef MAP_SYNC
+#define MAP_SYNC 0
+#endif
+
+/*
+ * The historical set of flags that all mmap implementations implicitly
+ * support when a ->mmap_validate() op is not provided in file_operations.
+ */
+#define LEGACY_MAP_MASK (MAP_SHARED \
+		| MAP_PRIVATE \
+		| MAP_FIXED \
+		| MAP_ANONYMOUS \
+		| MAP_DENYWRITE \
+		| MAP_EXECUTABLE \
+		| MAP_UNINITIALIZED \
+		| MAP_GROWSDOWN \
+		| MAP_LOCKED \
+		| MAP_NORESERVE \
+		| MAP_POPULATE \
+		| MAP_NONBLOCK \
+		| MAP_STACK \
+		| MAP_HUGETLB \
+		| MAP_32BIT \
+		| MAP_HUGE_2MB \
+		| MAP_HUGE_1GB)
+
 extern int sysctl_overcommit_memory;
 extern int sysctl_overcommit_ratio;
 extern unsigned long sysctl_overcommit_kbytes;
@@ -64,8 +106,9 @@ static inline bool arch_validate_prot(unsigned long prot)
 * ("bit1" and "bit2" must be single bits)
 */
 #define _calc_vm_trans(x, bit1, bit2) \
+  ((!(bit1) || !(bit2)) ? 0 : \
  ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
-   : ((x) & (bit1)) / ((bit1) / (bit2)))
+   : ((x) & (bit1)) / ((bit1) / (bit2))))

 /*
 * Combine the mmap "prot" argument into "vm_flags" used internally.
@@ -87,7 +130,8 @@ calc_vm_flag_bits(unsigned long flags)
 {
 	return _calc_vm_trans(flags, MAP_GROWSDOWN,  VM_GROWSDOWN ) |
 	       _calc_vm_trans(flags, MAP_DENYWRITE,  VM_DENYWRITE ) |
-	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    );
+	       _calc_vm_trans(flags, MAP_LOCKED,     VM_LOCKED    ) |
+	       _calc_vm_trans(flags, MAP_SYNC,	     VM_SYNC      );
 }

 unsigned long vm_commit_limit(void);

--- a/include/trace/events/fs_dax.h
+++ b/include/trace/events/fs_dax.h
@@ -149,7 +149,6 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
 	TP_ARGS(inode, vmf, length, pfn, radix_entry))

 DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
-DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);

 DECLARE_EVENT_CLASS(dax_pte_fault_class,
 	TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result),
@@ -192,6 +191,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \
 DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
 DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
 DEFINE_PTE_FAULT_EVENT(dax_load_hole);
+DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite_no_entry);
+DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite);

 TRACE_EVENT(dax_insert_mapping,
 	TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry),

--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -17,6 +17,7 @@

 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */

--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -13,6 +13,7 @@
 #define MAP_NONBLOCK	0x10000		/* do not block on IO */
 #define MAP_STACK	0x20000		/* give out an address that is best suited for process/thread stacks */
 #define MAP_HUGETLB	0x40000		/* create a huge page mapping */
+#define MAP_SYNC	0x80000		/* perform synchronous page faults for the mapping */

 /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */


--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1387,9 +1387,24 @@ unsigned long do_mmap(struct file *file, unsigned long addr,

 	if (file) {
 		struct inode *inode = file_inode(file);
+		unsigned long flags_mask;
+
+		flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;

 		switch (flags & MAP_TYPE) {
 		case MAP_SHARED:
+			/*
+			 * Force use of MAP_SHARED_VALIDATE with non-legacy
+			 * flags. E.g. MAP_SYNC is dangerous to use with
+			 * MAP_SHARED as you don't know which consistency model
+			 * you will get. We silently ignore unsupported flags
+			 * with MAP_SHARED to preserve backward compatibility.
+			 */
+			flags &= LEGACY_MAP_MASK;
+			/* fall through */
+		case MAP_SHARED_VALIDATE:
+			if (flags & ~flags_mask)
+				return -EOPNOTSUPP;
 			if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
 				return -EACCES;


--- a/tools/include/uapi/asm-generic/mman-common.h
+++ b/tools/include/uapi/asm-generic/mman-common.h
@@ -17,6 +17,7 @@

 #define MAP_SHARED	0x01		/* Share changes */
 #define MAP_PRIVATE	0x02		/* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03	/* share + validate extension flags */
 #define MAP_TYPE	0x0f		/* Mask for type of mapping */
 #define MAP_FIXED	0x10		/* Interpret addr exactly */
 #define MAP_ANONYMOUS	0x20		/* don't use a file */

--- a/tools/testing/nvdimm/Kbuild
+++ b/tools/testing/nvdimm/Kbuild
@@ -70,6 +70,7 @@ libnvdimm-y += $(NVDIMM_SRC)/region_devs.o
 libnvdimm-y += $(NVDIMM_SRC)/region.o
 libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o
 libnvdimm-y += $(NVDIMM_SRC)/label.o
+libnvdimm-y += $(NVDIMM_SRC)/badrange.o
 libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o
 libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o
 libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o

--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h
@@ -32,6 +32,58 @@ struct nfit_test_resource {
 	void *buf;
 };

+#define ND_TRANSLATE_SPA_STATUS_INVALID_SPA  2
+#define NFIT_ARS_INJECT_INVALID 2
+
+enum err_inj_options {
+	ND_ARS_ERR_INJ_OPT_NOTIFY = 0,
+};
+
+/* nfit commands */
+enum nfit_cmd_num {
+	NFIT_CMD_TRANSLATE_SPA = 5,
+	NFIT_CMD_ARS_INJECT_SET = 7,
+	NFIT_CMD_ARS_INJECT_CLEAR = 8,
+	NFIT_CMD_ARS_INJECT_GET = 9,
+};
+
+struct nd_cmd_translate_spa {
+	__u64 spa;
+	__u32 status;
+	__u8  flags;
+	__u8  _reserved[3];
+	__u64 translate_length;
+	__u32 num_nvdimms;
+	struct nd_nvdimm_device {
+		__u32 nfit_device_handle;
+		__u32 _reserved;
+		__u64 dpa;
+	} __packed devices[0];
+
+} __packed;
+
+struct nd_cmd_ars_err_inj {
+	__u64 err_inj_spa_range_base;
+	__u64 err_inj_spa_range_length;
+	__u8  err_inj_options;
+	__u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_clr {
+	__u64 err_inj_clr_spa_range_base;
+	__u64 err_inj_clr_spa_range_length;
+	__u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_stat {
+	__u32 status;
+	__u32 inj_err_rec_count;
+	struct nd_error_stat_query_record {
+		__u64 err_inj_stat_spa_range_base;
+		__u64 err_inj_stat_spa_range_length;
+	} __packed record[0];
+} __packed;
+
 union acpi_object;
 typedef void *acpi_handle;