Commit 956a4cd2 authored by Dan Williams's avatar Dan Williams

device-dax: switch to srcu, fix rcu_read_lock() vs pte allocation

The following warning triggers with a new unit test that stresses the
device-dax interface.

 ===============================
 [ ERR: suspicious RCU usage.  ]
 4.11.0-rc4+ #1049 Tainted: G           O
 -------------------------------
 ./include/linux/rcupdate.h:521 Illegal context switch in RCU read-side critical section!

 other info that might help us debug this:

 rcu_scheduler_active = 2, debug_locks = 0
 2 locks held by fio/9070:
  #0:  (&mm->mmap_sem){++++++}, at: [<ffffffff8d0739d7>] __do_page_fault+0x167/0x4f0
  #1:  (rcu_read_lock){......}, at: [<ffffffffc03fbd02>] dax_dev_huge_fault+0x32/0x620 [dax]

 Call Trace:
  dump_stack+0x86/0xc3
  lockdep_rcu_suspicious+0xd7/0x110
  ___might_sleep+0xac/0x250
  __might_sleep+0x4a/0x80
  __alloc_pages_nodemask+0x23a/0x360
  alloc_pages_current+0xa1/0x1f0
  pte_alloc_one+0x17/0x80
  __pte_alloc+0x1e/0x120
  __get_locked_pte+0x1bf/0x1d0
  insert_pfn.isra.70+0x3a/0x100
  ? lookup_memtype+0xa6/0xd0
  vm_insert_mixed+0x64/0x90
  dax_dev_huge_fault+0x520/0x620 [dax]
  ? dax_dev_huge_fault+0x32/0x620 [dax]
  dax_dev_fault+0x10/0x20 [dax]
  __do_fault+0x1e/0x140
  __handle_mm_fault+0x9af/0x10d0
  handle_mm_fault+0x16d/0x370
  ? handle_mm_fault+0x47/0x370
  __do_page_fault+0x28c/0x4f0
  trace_do_page_fault+0x58/0x2a0
  do_async_page_fault+0x1a/0xa0
  async_page_fault+0x28/0x30

Inserting a page table entry may trigger an allocation while we are
holding a read lock to keep the device instance alive for the duration
of the fault. Use srcu for this keep-alive protection.

Fixes: dee41079 ("/dev/dax, core: file operations and dax-mmap")
Cc: <stable@vger.kernel.org>
Signed-off-by: default avatarDan Williams <dan.j.williams@intel.com>
parent 4aa5615e
...@@ -2,6 +2,7 @@ menuconfig DEV_DAX ...@@ -2,6 +2,7 @@ menuconfig DEV_DAX
tristate "DAX: direct access to differentiated memory" tristate "DAX: direct access to differentiated memory"
default m if NVDIMM_DAX default m if NVDIMM_DAX
depends on TRANSPARENT_HUGEPAGE depends on TRANSPARENT_HUGEPAGE
select SRCU
help help
Support raw access to differentiated (persistence, bandwidth, Support raw access to differentiated (persistence, bandwidth,
latency...) memory via an mmap(2) capable character latency...) memory via an mmap(2) capable character
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "dax.h" #include "dax.h"
static dev_t dax_devt; static dev_t dax_devt;
DEFINE_STATIC_SRCU(dax_srcu);
static struct class *dax_class; static struct class *dax_class;
static DEFINE_IDA(dax_minor_ida); static DEFINE_IDA(dax_minor_ida);
static int nr_dax = CONFIG_NR_DEV_DAX; static int nr_dax = CONFIG_NR_DEV_DAX;
...@@ -60,7 +61,7 @@ struct dax_region { ...@@ -60,7 +61,7 @@ struct dax_region {
* @region - parent region * @region - parent region
* @dev - device backing the character device * @dev - device backing the character device
* @cdev - core chardev data * @cdev - core chardev data
* @alive - !alive + rcu grace period == no new mappings can be established * @alive - !alive + srcu grace period == no new mappings can be established
* @id - child id in the region * @id - child id in the region
* @num_resources - number of physical address extents in this device * @num_resources - number of physical address extents in this device
* @res - array of physical address ranges * @res - array of physical address ranges
...@@ -569,7 +570,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf) ...@@ -569,7 +570,7 @@ static int __dax_dev_pud_fault(struct dax_dev *dax_dev, struct vm_fault *vmf)
static int dax_dev_huge_fault(struct vm_fault *vmf, static int dax_dev_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size) enum page_entry_size pe_size)
{ {
int rc; int rc, id;
struct file *filp = vmf->vma->vm_file; struct file *filp = vmf->vma->vm_file;
struct dax_dev *dax_dev = filp->private_data; struct dax_dev *dax_dev = filp->private_data;
...@@ -578,7 +579,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, ...@@ -578,7 +579,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf,
? "write" : "read", ? "write" : "read",
vmf->vma->vm_start, vmf->vma->vm_end); vmf->vma->vm_start, vmf->vma->vm_end);
rcu_read_lock(); id = srcu_read_lock(&dax_srcu);
switch (pe_size) { switch (pe_size) {
case PE_SIZE_PTE: case PE_SIZE_PTE:
rc = __dax_dev_pte_fault(dax_dev, vmf); rc = __dax_dev_pte_fault(dax_dev, vmf);
...@@ -592,7 +593,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf, ...@@ -592,7 +593,7 @@ static int dax_dev_huge_fault(struct vm_fault *vmf,
default: default:
return VM_FAULT_FALLBACK; return VM_FAULT_FALLBACK;
} }
rcu_read_unlock(); srcu_read_unlock(&dax_srcu, id);
return rc; return rc;
} }
...@@ -713,11 +714,11 @@ static void unregister_dax_dev(void *dev) ...@@ -713,11 +714,11 @@ static void unregister_dax_dev(void *dev)
* Note, rcu is not protecting the liveness of dax_dev, rcu is * Note, rcu is not protecting the liveness of dax_dev, rcu is
* ensuring that any fault handlers that might have seen * ensuring that any fault handlers that might have seen
* dax_dev->alive == true, have completed. Any fault handlers * dax_dev->alive == true, have completed. Any fault handlers
* that start after synchronize_rcu() has started will abort * that start after synchronize_srcu() has started will abort
* upon seeing dax_dev->alive == false. * upon seeing dax_dev->alive == false.
*/ */
dax_dev->alive = false; dax_dev->alive = false;
synchronize_rcu(); synchronize_srcu(&dax_srcu);
unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1); unmap_mapping_range(dax_dev->inode->i_mapping, 0, 0, 1);
cdev_del(cdev); cdev_del(cdev);
device_unregister(dev); device_unregister(dev);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment