Commit cf4ee73f authored by Changbin Du's avatar Changbin Du Committed by Zhenyu Wang

drm/i915/gvt: Fix guest vGPU hang caused by very high dma setup overhead

The implementation of current kvmgt implicitly setup dma mapping at MPT
API gfn_to_mfn. First this design against the API's original purpose.
Second, there is no unmap hit in this design. The result is that the
dma mapping keep growing larger and larger. For mutl-vm case, they will
consume IOMMU IOVA low 4GB address space quickly and so tons of rbtree
entries crated in the IOMMU IOVA allocator. Finally, single IOVA
allocation can take as long as ~70ms. Such latency is intolerable.

To address both above issues, this patch introduced two new MPT API:
  o dma_map_guest_page - setup dma map for guest page
  o dma_unmap_guest_page - cancel dma map for guest page

The kvmgt implements these 2 API. And to reduce dma setup overhead for
duplicated pages (eg. scratch pages), two caches are used: one is for
mapping gfn to struct gvt_dma, another is for mapping dma addr to
struct gvt_dma.

With these 2 new API, the gtt now is able to cancel dma mapping when page
table is invalidated. The dma mapping is not in a gradual increase now.

v2: follow the old logic for VFIO_IOMMU_NOTIFY_DMA_UNMAP at this point.

Cc: Hang Yuan <hang.yuan@intel.com>
Cc: Xiong Zhang <xiong.y.zhang@intel.com>
Signed-off-by: default avatarChangbin Du <changbin.du@intel.com>
Signed-off-by: default avatarZhenyu Wang <zhenyuw@linux.intel.com>
parent b52646fd
...@@ -822,6 +822,23 @@ static int ppgtt_invalidate_spt_by_shadow_entry(struct intel_vgpu *vgpu, ...@@ -822,6 +822,23 @@ static int ppgtt_invalidate_spt_by_shadow_entry(struct intel_vgpu *vgpu,
return ppgtt_invalidate_spt(s); return ppgtt_invalidate_spt(s);
} }
static inline void ppgtt_invalidate_pte(struct intel_vgpu_ppgtt_spt *spt,
struct intel_gvt_gtt_entry *entry)
{
struct intel_vgpu *vgpu = spt->vgpu;
struct intel_gvt_gtt_pte_ops *ops = vgpu->gvt->gtt.pte_ops;
unsigned long pfn;
int type;
pfn = ops->get_pfn(entry);
type = spt->shadow_page.type;
if (pfn == vgpu->gtt.scratch_pt[type].page_mfn)
return;
intel_gvt_hypervisor_dma_unmap_guest_page(vgpu, pfn << PAGE_SHIFT);
}
static int ppgtt_invalidate_spt(struct intel_vgpu_ppgtt_spt *spt) static int ppgtt_invalidate_spt(struct intel_vgpu_ppgtt_spt *spt)
{ {
struct intel_vgpu *vgpu = spt->vgpu; struct intel_vgpu *vgpu = spt->vgpu;
...@@ -838,14 +855,12 @@ static int ppgtt_invalidate_spt(struct intel_vgpu_ppgtt_spt *spt) ...@@ -838,14 +855,12 @@ static int ppgtt_invalidate_spt(struct intel_vgpu_ppgtt_spt *spt)
if (atomic_dec_return(&spt->refcount) > 0) if (atomic_dec_return(&spt->refcount) > 0)
return 0; return 0;
if (gtt_type_is_pte_pt(spt->shadow_page.type))
goto release;
for_each_present_shadow_entry(spt, &e, index) { for_each_present_shadow_entry(spt, &e, index) {
switch (e.type) { switch (e.type) {
case GTT_TYPE_PPGTT_PTE_4K_ENTRY: case GTT_TYPE_PPGTT_PTE_4K_ENTRY:
gvt_vdbg_mm("invalidate 4K entry\n"); gvt_vdbg_mm("invalidate 4K entry\n");
continue; ppgtt_invalidate_pte(spt, &e);
break;
case GTT_TYPE_PPGTT_PTE_2M_ENTRY: case GTT_TYPE_PPGTT_PTE_2M_ENTRY:
case GTT_TYPE_PPGTT_PTE_1G_ENTRY: case GTT_TYPE_PPGTT_PTE_1G_ENTRY:
WARN(1, "GVT doesn't support 2M/1GB page\n"); WARN(1, "GVT doesn't support 2M/1GB page\n");
...@@ -863,7 +878,7 @@ static int ppgtt_invalidate_spt(struct intel_vgpu_ppgtt_spt *spt) ...@@ -863,7 +878,7 @@ static int ppgtt_invalidate_spt(struct intel_vgpu_ppgtt_spt *spt)
GEM_BUG_ON(1); GEM_BUG_ON(1);
} }
} }
release:
trace_spt_change(spt->vgpu->id, "release", spt, trace_spt_change(spt->vgpu->id, "release", spt,
spt->guest_page.gfn, spt->shadow_page.type); spt->guest_page.gfn, spt->shadow_page.type);
ppgtt_free_spt(spt); ppgtt_free_spt(spt);
...@@ -932,7 +947,9 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu, ...@@ -932,7 +947,9 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
{ {
struct intel_gvt_gtt_pte_ops *pte_ops = vgpu->gvt->gtt.pte_ops; struct intel_gvt_gtt_pte_ops *pte_ops = vgpu->gvt->gtt.pte_ops;
struct intel_gvt_gtt_entry se = *ge; struct intel_gvt_gtt_entry se = *ge;
unsigned long gfn, mfn; unsigned long gfn;
dma_addr_t dma_addr;
int ret;
if (!pte_ops->test_present(ge)) if (!pte_ops->test_present(ge))
return 0; return 0;
...@@ -952,11 +969,11 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu, ...@@ -952,11 +969,11 @@ static int ppgtt_populate_shadow_entry(struct intel_vgpu *vgpu,
}; };
/* direct shadow */ /* direct shadow */
mfn = intel_gvt_hypervisor_gfn_to_mfn(vgpu, gfn); ret = intel_gvt_hypervisor_dma_map_guest_page(vgpu, gfn, &dma_addr);
if (mfn == INTEL_GVT_INVALID_ADDR) if (ret)
return -ENXIO; return -ENXIO;
pte_ops->set_pfn(&se, mfn); pte_ops->set_pfn(&se, dma_addr >> PAGE_SHIFT);
ppgtt_set_shadow_entry(spt, &se, index); ppgtt_set_shadow_entry(spt, &se, index);
return 0; return 0;
} }
...@@ -1035,7 +1052,9 @@ static int ppgtt_handle_guest_entry_removal(struct intel_vgpu_ppgtt_spt *spt, ...@@ -1035,7 +1052,9 @@ static int ppgtt_handle_guest_entry_removal(struct intel_vgpu_ppgtt_spt *spt,
ret = ppgtt_invalidate_spt(s); ret = ppgtt_invalidate_spt(s);
if (ret) if (ret)
goto fail; goto fail;
} } else
ppgtt_invalidate_pte(spt, se);
return 0; return 0;
fail: fail:
gvt_vgpu_err("fail: shadow page %p guest entry 0x%llx type %d\n", gvt_vgpu_err("fail: shadow page %p guest entry 0x%llx type %d\n",
...@@ -1807,8 +1826,10 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off, ...@@ -1807,8 +1826,10 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
struct intel_vgpu_mm *ggtt_mm = vgpu->gtt.ggtt_mm; struct intel_vgpu_mm *ggtt_mm = vgpu->gtt.ggtt_mm;
struct intel_gvt_gtt_pte_ops *ops = gvt->gtt.pte_ops; struct intel_gvt_gtt_pte_ops *ops = gvt->gtt.pte_ops;
unsigned long g_gtt_index = off >> info->gtt_entry_size_shift; unsigned long g_gtt_index = off >> info->gtt_entry_size_shift;
unsigned long gma, gfn, mfn; unsigned long gma, gfn;
struct intel_gvt_gtt_entry e, m; struct intel_gvt_gtt_entry e, m;
dma_addr_t dma_addr;
int ret;
if (bytes != 4 && bytes != 8) if (bytes != 4 && bytes != 8)
return -EINVAL; return -EINVAL;
...@@ -1836,8 +1857,9 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off, ...@@ -1836,8 +1857,9 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
goto out; goto out;
} }
mfn = intel_gvt_hypervisor_gfn_to_mfn(vgpu, gfn); ret = intel_gvt_hypervisor_dma_map_guest_page(vgpu, gfn,
if (mfn == INTEL_GVT_INVALID_ADDR) { &dma_addr);
if (ret) {
gvt_vgpu_err("fail to populate guest ggtt entry\n"); gvt_vgpu_err("fail to populate guest ggtt entry\n");
/* guest driver may read/write the entry when partial /* guest driver may read/write the entry when partial
* update the entry in this situation p2m will fail * update the entry in this situation p2m will fail
...@@ -1845,7 +1867,7 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off, ...@@ -1845,7 +1867,7 @@ static int emulate_ggtt_mmio_write(struct intel_vgpu *vgpu, unsigned int off,
*/ */
ops->set_pfn(&m, gvt->gtt.scratch_mfn); ops->set_pfn(&m, gvt->gtt.scratch_mfn);
} else } else
ops->set_pfn(&m, mfn); ops->set_pfn(&m, dma_addr >> PAGE_SHIFT);
} else } else
ops->set_pfn(&m, gvt->gtt.scratch_mfn); ops->set_pfn(&m, gvt->gtt.scratch_mfn);
......
...@@ -201,8 +201,15 @@ struct intel_vgpu { ...@@ -201,8 +201,15 @@ struct intel_vgpu {
int num_regions; int num_regions;
struct eventfd_ctx *intx_trigger; struct eventfd_ctx *intx_trigger;
struct eventfd_ctx *msi_trigger; struct eventfd_ctx *msi_trigger;
struct rb_root cache;
/*
* Two caches are used to avoid mapping duplicated pages (eg.
* scratch pages). This help to reduce dma setup overhead.
*/
struct rb_root gfn_cache;
struct rb_root dma_addr_cache;
struct mutex cache_lock; struct mutex cache_lock;
struct notifier_block iommu_notifier; struct notifier_block iommu_notifier;
struct notifier_block group_notifier; struct notifier_block group_notifier;
struct kvm *kvm; struct kvm *kvm;
......
...@@ -51,6 +51,11 @@ struct intel_gvt_mpt { ...@@ -51,6 +51,11 @@ struct intel_gvt_mpt {
int (*write_gpa)(unsigned long handle, unsigned long gpa, void *buf, int (*write_gpa)(unsigned long handle, unsigned long gpa, void *buf,
unsigned long len); unsigned long len);
unsigned long (*gfn_to_mfn)(unsigned long handle, unsigned long gfn); unsigned long (*gfn_to_mfn)(unsigned long handle, unsigned long gfn);
int (*dma_map_guest_page)(unsigned long handle, unsigned long gfn,
dma_addr_t *dma_addr);
void (*dma_unmap_guest_page)(unsigned long handle, dma_addr_t dma_addr);
int (*map_gfn_to_mfn)(unsigned long handle, unsigned long gfn, int (*map_gfn_to_mfn)(unsigned long handle, unsigned long gfn,
unsigned long mfn, unsigned int nr, bool map); unsigned long mfn, unsigned int nr, bool map);
int (*set_trap_area)(unsigned long handle, u64 start, u64 end, int (*set_trap_area)(unsigned long handle, u64 start, u64 end,
......
This diff is collapsed.
...@@ -227,6 +227,34 @@ static inline unsigned long intel_gvt_hypervisor_gfn_to_mfn( ...@@ -227,6 +227,34 @@ static inline unsigned long intel_gvt_hypervisor_gfn_to_mfn(
return intel_gvt_host.mpt->gfn_to_mfn(vgpu->handle, gfn); return intel_gvt_host.mpt->gfn_to_mfn(vgpu->handle, gfn);
} }
/**
* intel_gvt_hypervisor_dma_map_guest_page - setup dma map for guest page
* @vgpu: a vGPU
* @gpfn: guest pfn
* @dma_addr: retrieve allocated dma addr
*
* Returns:
* 0 on success, negative error code if failed.
*/
static inline int intel_gvt_hypervisor_dma_map_guest_page(
struct intel_vgpu *vgpu, unsigned long gfn,
dma_addr_t *dma_addr)
{
return intel_gvt_host.mpt->dma_map_guest_page(vgpu->handle, gfn,
dma_addr);
}
/**
* intel_gvt_hypervisor_dma_unmap_guest_page - cancel dma map for guest page
* @vgpu: a vGPU
* @dma_addr: the mapped dma addr
*/
static inline void intel_gvt_hypervisor_dma_unmap_guest_page(
struct intel_vgpu *vgpu, dma_addr_t dma_addr)
{
intel_gvt_host.mpt->dma_unmap_guest_page(vgpu->handle, dma_addr);
}
/** /**
* intel_gvt_hypervisor_map_gfn_to_mfn - map a GFN region to MFN * intel_gvt_hypervisor_map_gfn_to_mfn - map a GFN region to MFN
* @vgpu: a vGPU * @vgpu: a vGPU
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment