Commit f6944d4a authored by Alex Williamson

vfio/pci: Collect hot-reset devices to local buffer

Lockdep reports the below circular locking dependency issue.  The
mmap_lock acquisition while holding pci_bus_sem is due to the use of
copy_to_user() from within a pci_walk_bus() callback.

Building the devices array directly into the user buffer is only for
convenience.  Instead we can allocate a local buffer for the array,
bounded by the number of devices on the bus/slot, fill the device
information into this local buffer, then copy it into the user buffer
outside the bus walk callback.

======================================================
WARNING: possible circular locking dependency detected
6.9.0-rc5+ #39 Not tainted
------------------------------------------------------
CPU 0/KVM/4113 is trying to acquire lock:
ffff99a609ee18a8 (&vdev->vma_lock){+.+.}-{4:4}, at: vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core]

but task is already holding lock:
ffff99a243a052a0 (&mm->mmap_lock){++++}-{4:4}, at: vaddr_get_pfns+0x3f/0x170 [vfio_iommu_type1]

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #3 (&mm->mmap_lock){++++}-{4:4}:
       __lock_acquire+0x4e4/0xb90
       lock_acquire+0xbc/0x2d0
       __might_fault+0x5c/0x80
       _copy_to_user+0x1e/0x60
       vfio_pci_fill_devs+0x9f/0x130 [vfio_pci_core]
       vfio_pci_walk_wrapper+0x45/0x60 [vfio_pci_core]
       __pci_walk_bus+0x6b/0xb0
       vfio_pci_ioctl_get_pci_hot_reset_info+0x10b/0x1d0 [vfio_pci_core]
       vfio_pci_core_ioctl+0x1cb/0x400 [vfio_pci_core]
       vfio_device_fops_unl_ioctl+0x7e/0x140 [vfio]
       __x64_sys_ioctl+0x8a/0xc0
       do_syscall_64+0x8d/0x170
       entry_SYSCALL_64_after_hwframe+0x76/0x7e

-> #2 (pci_bus_sem){++++}-{4:4}:
       __lock_acquire+0x4e4/0xb90
       lock_acquire+0xbc/0x2d0
       down_read+0x3e/0x160
       pci_bridge_wait_for_secondary_bus.part.0+0x33/0x2d0
       pci_reset_bus+0xdd/0x160
       vfio_pci_dev_set_hot_reset+0x256/0x270 [vfio_pci_core]
       vfio_pci_ioctl_pci_hot_reset_groups+0x1a3/0x280 [vfio_pci_core]
       vfio_pci_core_ioctl+0x3b5/0x400 [vfio_pci_core]
       vfio_device_fops_unl_ioctl+0x7e/0x140 [vfio]
       __x64_sys_ioctl+0x8a/0xc0
       do_syscall_64+0x8d/0x170
       entry_SYSCALL_64_after_hwframe+0x76/0x7e

-> #1 (&vdev->memory_lock){+.+.}-{4:4}:
       __lock_acquire+0x4e4/0xb90
       lock_acquire+0xbc/0x2d0
       down_write+0x3b/0xc0
       vfio_pci_zap_and_down_write_memory_lock+0x1c/0x30 [vfio_pci_core]
       vfio_basic_config_write+0x281/0x340 [vfio_pci_core]
       vfio_config_do_rw+0x1fa/0x300 [vfio_pci_core]
       vfio_pci_config_rw+0x75/0xe50 [vfio_pci_core]
       vfio_pci_rw+0xea/0x1a0 [vfio_pci_core]
       vfs_write+0xea/0x520
       __x64_sys_pwrite64+0x90/0xc0
       do_syscall_64+0x8d/0x170
       entry_SYSCALL_64_after_hwframe+0x76/0x7e

-> #0 (&vdev->vma_lock){+.+.}-{4:4}:
       check_prev_add+0xeb/0xcc0
       validate_chain+0x465/0x530
       __lock_acquire+0x4e4/0xb90
       lock_acquire+0xbc/0x2d0
       __mutex_lock+0x97/0xde0
       vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core]
       __do_fault+0x31/0x160
       do_pte_missing+0x65/0x3b0
       __handle_mm_fault+0x303/0x720
       handle_mm_fault+0x10f/0x460
       fixup_user_fault+0x7f/0x1f0
       follow_fault_pfn+0x66/0x1c0 [vfio_iommu_type1]
       vaddr_get_pfns+0xf2/0x170 [vfio_iommu_type1]
       vfio_pin_pages_remote+0x348/0x4e0 [vfio_iommu_type1]
       vfio_pin_map_dma+0xd2/0x330 [vfio_iommu_type1]
       vfio_dma_do_map+0x2c0/0x440 [vfio_iommu_type1]
       vfio_iommu_type1_ioctl+0xc5/0x1d0 [vfio_iommu_type1]
       __x64_sys_ioctl+0x8a/0xc0
       do_syscall_64+0x8d/0x170
       entry_SYSCALL_64_after_hwframe+0x76/0x7e

other info that might help us debug this:

Chain exists of:
  &vdev->vma_lock --> pci_bus_sem --> &mm->mmap_lock

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  rlock(&mm->mmap_lock);
                               lock(pci_bus_sem);
                               lock(&mm->mmap_lock);
  lock(&vdev->vma_lock);

 *** DEADLOCK ***

2 locks held by CPU 0/KVM/4113:
 #0: ffff99a25f294888 (&iommu->lock#2){+.+.}-{4:4}, at: vfio_dma_do_map+0x60/0x440 [vfio_iommu_type1]
 #1: ffff99a243a052a0 (&mm->mmap_lock){++++}-{4:4}, at: vaddr_get_pfns+0x3f/0x170 [vfio_iommu_type1]

stack backtrace:
CPU: 1 PID: 4113 Comm: CPU 0/KVM Not tainted 6.9.0-rc5+ #39
Hardware name: Dell Inc. PowerEdge T640/04WYPY, BIOS 2.15.1 06/16/2022
Call Trace:
 <TASK>
 dump_stack_lvl+0x64/0xa0
 check_noncircular+0x131/0x150
 check_prev_add+0xeb/0xcc0
 ? add_chain_cache+0x10a/0x2f0
 ? __lock_acquire+0x4e4/0xb90
 validate_chain+0x465/0x530
 __lock_acquire+0x4e4/0xb90
 lock_acquire+0xbc/0x2d0
 ? vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core]
 ? lock_is_held_type+0x9a/0x110
 __mutex_lock+0x97/0xde0
 ? vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core]
 ? lock_acquire+0xbc/0x2d0
 ? vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core]
 ? find_held_lock+0x2b/0x80
 ? vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core]
 vfio_pci_mmap_fault+0x35/0x1a0 [vfio_pci_core]
 __do_fault+0x31/0x160
 do_pte_missing+0x65/0x3b0
 __handle_mm_fault+0x303/0x720
 handle_mm_fault+0x10f/0x460
 fixup_user_fault+0x7f/0x1f0
 follow_fault_pfn+0x66/0x1c0 [vfio_iommu_type1]
 vaddr_get_pfns+0xf2/0x170 [vfio_iommu_type1]
 vfio_pin_pages_remote+0x348/0x4e0 [vfio_iommu_type1]
 vfio_pin_map_dma+0xd2/0x330 [vfio_iommu_type1]
 vfio_dma_do_map+0x2c0/0x440 [vfio_iommu_type1]
 vfio_iommu_type1_ioctl+0xc5/0x1d0 [vfio_iommu_type1]
 __x64_sys_ioctl+0x8a/0xc0
 do_syscall_64+0x8d/0x170
 ? rcu_core+0x8d/0x250
 ? __lock_release+0x5e/0x160
 ? rcu_core+0x8d/0x250
 ? lock_release+0x5f/0x120
 ? sched_clock+0xc/0x30
 ? sched_clock_cpu+0xb/0x190
 ? irqtime_account_irq+0x40/0xc0
 ? __local_bh_enable+0x54/0x60
 ? __do_softirq+0x315/0x3ca
 ? lockdep_hardirqs_on_prepare.part.0+0x97/0x140
 entry_SYSCALL_64_after_hwframe+0x76/0x7e
RIP: 0033:0x7f8300d0357b
Code: ff ff ff 85 c0 79 9b 49 c7 c4 ff ff ff ff 5b 5d 4c 89 e0 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 75 68 0f 00 f7 d8 64 89 01 48
RSP: 002b:00007f82ef3fb948 EFLAGS: 00000206 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f8300d0357b
RDX: 00007f82ef3fb990 RSI: 0000000000003b71 RDI: 0000000000000023
RBP: 00007f82ef3fb9c0 R08: 0000000000000000 R09: 0000561b7e0bcac2
R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000
R13: 0000000200000000 R14: 0000381800000000 R15: 0000000000000000
 </TASK>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Link: https://lore.kernel.org/r/20240503143138.3562116-1-alex.williamson@redhat.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
parent bb208810
drivers/vfio/pci/vfio_pci_core.c

@@ -778,25 +778,26 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
 }
 
 struct vfio_pci_fill_info {
-	struct vfio_pci_dependent_device __user *devices;
-	struct vfio_pci_dependent_device __user *devices_end;
 	struct vfio_device *vdev;
+	struct vfio_pci_dependent_device *devices;
+	int nr_devices;
 	u32 count;
 	u32 flags;
 };
 
 static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
 {
-	struct vfio_pci_dependent_device info = {
-		.segment = pci_domain_nr(pdev->bus),
-		.bus = pdev->bus->number,
-		.devfn = pdev->devfn,
-	};
+	struct vfio_pci_dependent_device *info;
 	struct vfio_pci_fill_info *fill = data;
 
-	fill->count++;
-	if (fill->devices >= fill->devices_end)
-		return 0;
+	/* The topology changed since we counted devices */
+	if (fill->count >= fill->nr_devices)
+		return -EAGAIN;
+
+	info = &fill->devices[fill->count++];
+	info->segment = pci_domain_nr(pdev->bus);
+	info->bus = pdev->bus->number;
+	info->devfn = pdev->devfn;
 
 	if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
 		struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
@@ -809,19 +810,19 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
 		 */
 		vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
 		if (!vdev) {
-			info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+			info->devid = VFIO_PCI_DEVID_NOT_OWNED;
 		} else {
 			int id = vfio_iommufd_get_dev_id(vdev, iommufd);
 
 			if (id > 0)
-				info.devid = id;
+				info->devid = id;
 			else if (id == -ENOENT)
-				info.devid = VFIO_PCI_DEVID_OWNED;
+				info->devid = VFIO_PCI_DEVID_OWNED;
 			else
-				info.devid = VFIO_PCI_DEVID_NOT_OWNED;
+				info->devid = VFIO_PCI_DEVID_NOT_OWNED;
 		}
 		/* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
-		if (info.devid == VFIO_PCI_DEVID_NOT_OWNED)
+		if (info->devid == VFIO_PCI_DEVID_NOT_OWNED)
 			fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
 	} else {
 		struct iommu_group *iommu_group;
@@ -830,13 +831,10 @@ static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
 		if (!iommu_group)
 			return -EPERM; /* Cannot reset non-isolated devices */
 
-		info.group_id = iommu_group_id(iommu_group);
+		info->group_id = iommu_group_id(iommu_group);
 		iommu_group_put(iommu_group);
 	}
 
-	if (copy_to_user(fill->devices, &info, sizeof(info)))
-		return -EFAULT;
-	fill->devices++;
 	return 0;
 }
 
@@ -1258,10 +1256,11 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
 {
 	unsigned long minsz =
 		offsetofend(struct vfio_pci_hot_reset_info, count);
+	struct vfio_pci_dependent_device *devices = NULL;
 	struct vfio_pci_hot_reset_info hdr;
 	struct vfio_pci_fill_info fill = {};
 	bool slot = false;
-	int ret = 0;
+	int ret, count;
 
 	if (copy_from_user(&hdr, arg, minsz))
 		return -EFAULT;
@@ -1277,9 +1276,23 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
 	else if (pci_probe_reset_bus(vdev->pdev->bus))
 		return -ENODEV;
 
-	fill.devices = arg->devices;
-	fill.devices_end = arg->devices +
-			   (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]);
+	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
+					    &count, slot);
+	if (ret)
+		return ret;
+
+	if (count > (hdr.argsz - sizeof(hdr)) / sizeof(*devices)) {
+		hdr.count = count;
+		ret = -ENOSPC;
+		goto header;
+	}
+
+	devices = kcalloc(count, sizeof(*devices), GFP_KERNEL);
+	if (!devices)
+		return -ENOMEM;
+
+	fill.devices = devices;
+	fill.nr_devices = count;
 	fill.vdev = &vdev->vdev;
 
 	if (vfio_device_cdev_opened(&vdev->vdev))
@@ -1291,16 +1304,23 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info(
 					    &fill, slot);
 	mutex_unlock(&vdev->vdev.dev_set->lock);
 	if (ret)
-		return ret;
+		goto out;
+
+	if (copy_to_user(arg->devices, devices,
+			 sizeof(*devices) * fill.count)) {
+		ret = -EFAULT;
+		goto out;
+	}
 
 	hdr.count = fill.count;
 	hdr.flags = fill.flags;
-	if (copy_to_user(arg, &hdr, minsz))
-		return -EFAULT;
 
-	if (fill.count > fill.devices - arg->devices)
-		return -ENOSPC;
+header:
+	if (copy_to_user(arg, &hdr, minsz))
+		ret = -EFAULT;
 
-	return 0;
+out:
+	kfree(devices);
+	return ret;
 }
 
 static int