Commit ef51d754 authored by Thomas Hellström

drm/xe/migrate: Fix CCS copy for small VRAM copy chunks

Since the migrate code is using the identity map for addressing VRAM,
copy chunks may become as small as 64K if the VRAM resource is fragmented.

However, a chunk size smaller than 1MiB may lead to the *next* chunk's
offset into the CCS metadata backup memory not being page-aligned, which
the XY_CTRL_SURF_COPY_BLT command can't handle, and even if it could,
the current code doesn't handle the offset calculation correctly.

To fix this, make sure we align the size of VRAM copy chunks to 1MiB. If
the remaining data to copy is smaller than that, that's not a problem,
so use the remaining size. If the VRAM copy chunk becomes fragmented due
to the size alignment restriction, don't use the identity map, but instead
emit PTEs into the page-table like we do for system memory.
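
As an illustration of the chunk-sizing rule above, here is a minimal,
standalone C sketch (user-space only, not the driver code). The 256 bytes
of CCS metadata per 64K of main memory and the helper names chunk_size()
and min_chunk_size() are assumptions made for the example; the actual
logic lives in xe_migrate_res_sizes() and xe_migrate_allow_identity() in
the diff below.

/* Standalone sketch of the chunk-size alignment rule (not the kernel code). */
#include <stdint.h>
#include <stdio.h>

#define SZ_4K	0x1000ULL
#define SZ_64K	0x10000ULL
#define SZ_8M	0x800000ULL
#define MAX_PREEMPTDISABLE_TRANSFER SZ_8M	/* per-pass limit */

/* Assumption for the example: 256 bytes of CCS metadata per 64K of data. */
#define CCS_BYTES_PER_64K 256ULL

static uint64_t roundup_pow_of_two(uint64_t x)
{
	uint64_t r = 1;

	while (r < x)
		r <<= 1;
	return r;
}

/* Main-memory size backed by 4K of CCS metadata, rounded up to a power of two. */
static uint64_t min_chunk_size(void)
{
	return roundup_pow_of_two(SZ_4K * SZ_64K / CCS_BYTES_PER_64K); /* 1MiB */
}

/*
 * Chunk-size rule: clamp to the per-pass limit and to the current VRAM
 * block size (cur_size), then round down to a multiple of min_chunk_size
 * unless this is the final, smaller chunk.
 */
static uint64_t chunk_size(uint64_t cur_size, uint64_t remaining)
{
	uint64_t min_chunk = min_chunk_size();
	uint64_t size = remaining < MAX_PREEMPTDISABLE_TRANSFER ?
			remaining : MAX_PREEMPTDISABLE_TRANSFER;
	uint64_t chunk = cur_size > min_chunk ? cur_size : min_chunk;

	size = size < chunk ? size : chunk;
	if (size > min_chunk)
		size &= ~(min_chunk - 1);	/* round_down(size, min_chunk) */
	return size;
}

int main(void)
{
	/* Fragmented 64K block: the chunk stays 1MiB, so identity mapping
	 * would be skipped (block < chunk) and PTEs emitted instead. */
	printf("0x%llx\n", (unsigned long long)chunk_size(SZ_64K, 3 * SZ_8M));
	/* Large contiguous block: a full 8MiB pass, a multiple of 1MiB. */
	printf("0x%llx\n", (unsigned long long)chunk_size(SZ_8M, 3 * SZ_8M));
	/* Last chunk smaller than 1MiB: just use the remaining size. */
	printf("0x%llx\n", (unsigned long long)chunk_size(SZ_8M, 3 * SZ_64K));
	return 0;
}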

v2:
- Rebase
v3:
- Future-proof somewhat by taking the real data size to flat CCS
  metadata size ratio into account. (Matt Roper)
- Invert a couple of if-statements for better readability.
- Fix support for 4K-granularity VRAM sizes. (Tested on DG1).
v4:
- Fix up code comments
- Fix debug printout format typo.
v5:
- Add a Fixes: tag.

Cc: Matt Roper <matthew.d.roper@intel.com>
Cc: Matthew Auld <matthew.william.auld@gmail.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Fixes: e89b384c ("drm/xe/migrate: Update emit_pte to cope with a size level than 4k")
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240110163415.524165-1-thomas.hellstrom@linux.intel.com
parent cf46019e
@@ -331,7 +331,7 @@ static void xe_migrate_sanity_test(struct xe_migrate *m, struct kunit *test)
 	xe_res_first_sg(xe_bo_sg(pt), 0, pt->size, &src_it);
 	emit_pte(m, bb, NUM_KERNEL_PDE - 1, xe_bo_is_vram(pt), false,
-		 &src_it, XE_PAGE_SIZE, pt);
+		 &src_it, XE_PAGE_SIZE, pt->ttm.resource);
 	run_sanity_job(m, xe, bb, bb->len, "Writing PTE for our fake PT", test);
...
@@ -62,6 +62,8 @@ struct xe_migrate {
 	 * out of the pt_bo.
 	 */
 	struct drm_suballoc_manager vm_update_sa;
+	/** @min_chunk_size: For dgfx, Minimum chunk size */
+	u64 min_chunk_size;
 };

 #define MAX_PREEMPTDISABLE_TRANSFER SZ_8M /* Around 1ms. */
@@ -363,6 +365,19 @@ struct xe_migrate *xe_migrate_init(struct xe_tile *tile)
 	if (err)
 		return ERR_PTR(err);

+	if (IS_DGFX(xe)) {
+		if (xe_device_has_flat_ccs(xe))
+			/* min chunk size corresponds to 4K of CCS Metadata */
+			m->min_chunk_size = SZ_4K * SZ_64K /
+				xe_device_ccs_bytes(xe, SZ_64K);
+		else
+			/* Somewhat arbitrary to avoid a huge amount of blits */
+			m->min_chunk_size = SZ_64K;
+		m->min_chunk_size = roundup_pow_of_two(m->min_chunk_size);
+		drm_dbg(&xe->drm, "Migrate min chunk size is 0x%08llx\n",
+			(unsigned long long)m->min_chunk_size);
+	}
+
 	return m;
 }
@@ -374,16 +389,35 @@ static u64 max_mem_transfer_per_pass(struct xe_device *xe)
 	return MAX_PREEMPTDISABLE_TRANSFER;
 }

-static u64 xe_migrate_res_sizes(struct xe_device *xe, struct xe_res_cursor *cur)
+static u64 xe_migrate_res_sizes(struct xe_migrate *m, struct xe_res_cursor *cur)
 {
-	/*
-	 * For VRAM we use identity mapped pages so we are limited to current
-	 * cursor size. For system we program the pages ourselves so we have no
-	 * such limitation.
-	 */
-	return min_t(u64, max_mem_transfer_per_pass(xe),
-		     mem_type_is_vram(cur->mem_type) ? cur->size :
-		     cur->remaining);
+	struct xe_device *xe = tile_to_xe(m->tile);
+	u64 size = min_t(u64, max_mem_transfer_per_pass(xe), cur->remaining);
+
+	if (mem_type_is_vram(cur->mem_type)) {
+		/*
+		 * VRAM we want to blit in chunks with sizes aligned to
+		 * min_chunk_size in order for the offset to CCS metadata to be
+		 * page-aligned. If it's the last chunk it may be smaller.
+		 *
+		 * Another constraint is that we need to limit the blit to
+		 * the VRAM block size, unless size is smaller than
+		 * min_chunk_size.
+		 */
+		u64 chunk = max_t(u64, cur->size, m->min_chunk_size);
+
+		size = min_t(u64, size, chunk);
+		if (size > m->min_chunk_size)
+			size = round_down(size, m->min_chunk_size);
+	}
+
+	return size;
+}
+
+static bool xe_migrate_allow_identity(u64 size, const struct xe_res_cursor *cur)
+{
+	/* If the chunk is not fragmented, allow identity map. */
+	return cur->size >= size;
 }

 static u32 pte_update_size(struct xe_migrate *m,
@@ -396,7 +430,12 @@ static u32 pte_update_size(struct xe_migrate *m,
 	u32 cmds = 0;

 	*L0_pt = pt_ofs;
-	if (!is_vram) {
+	if (is_vram && xe_migrate_allow_identity(*L0, cur)) {
+		/* Offset into identity map. */
+		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
+					      cur->start + vram_region_gpu_offset(res));
+		cmds += cmd_size;
+	} else {
 		/* Clip L0 to available size */
 		u64 size = min(*L0, (u64)avail_pts * SZ_2M);
 		u64 num_4k_pages = DIV_ROUND_UP(size, XE_PAGE_SIZE);
@@ -412,11 +451,6 @@ static u32 pte_update_size(struct xe_migrate *m,
 		/* Each chunk has a single blit command */
 		cmds += cmd_size;
-	} else {
-		/* Offset into identity map. */
-		*L0_ofs = xe_migrate_vram_ofs(tile_to_xe(m->tile),
-					      cur->start + vram_region_gpu_offset(res));
-		cmds += cmd_size;
 	}

 	return cmds;
@@ -426,10 +460,10 @@ static void emit_pte(struct xe_migrate *m,
 		     struct xe_bb *bb, u32 at_pt,
 		     bool is_vram, bool is_comp_pte,
 		     struct xe_res_cursor *cur,
-		     u32 size, struct xe_bo *bo)
+		     u32 size, struct ttm_resource *res)
 {
 	struct xe_device *xe = tile_to_xe(m->tile);
+	struct xe_vm *vm = m->q->vm;
 	u16 pat_index;
 	u32 ptes;
 	u64 ofs = at_pt * XE_PAGE_SIZE;
@@ -442,13 +476,6 @@ static void emit_pte(struct xe_migrate *m,
 	else
 		pat_index = xe->pat.idx[XE_CACHE_WB];

-	/*
-	 * FIXME: Emitting VRAM PTEs to L0 PTs is forbidden. Currently
-	 * we're only emitting VRAM PTEs during sanity tests, so when
-	 * that's moved to a Kunit test, we should condition VRAM PTEs
-	 * on running tests.
-	 */
-
 	ptes = DIV_ROUND_UP(size, XE_PAGE_SIZE);

 	while (ptes) {
@@ -468,20 +495,22 @@ static void emit_pte(struct xe_migrate *m,
 		addr = xe_res_dma(cur) & PAGE_MASK;

 		if (is_vram) {
-			/* Is this a 64K PTE entry? */
-			if ((m->q->vm->flags & XE_VM_FLAG_64K) &&
-			    !(cur_ofs & (16 * 8 - 1))) {
-				xe_tile_assert(m->tile, IS_ALIGNED(addr, SZ_64K));
+			if (vm->flags & XE_VM_FLAG_64K) {
+				u64 va = cur_ofs * XE_PAGE_SIZE / 8;
+
+				xe_assert(xe, (va & (SZ_64K - 1)) ==
+					  (addr & (SZ_64K - 1)));
+
 				flags |= XE_PTE_PS64;
 			}

-			addr += vram_region_gpu_offset(bo->ttm.resource);
+			addr += vram_region_gpu_offset(res);
 			devmem = true;
 		}

-		addr = m->q->vm->pt_ops->pte_encode_addr(m->tile->xe,
-							 addr, pat_index,
-							 0, devmem, flags);
+		addr = vm->pt_ops->pte_encode_addr(m->tile->xe,
+						   addr, pat_index,
+						   0, devmem, flags);
 		bb->cs[bb->len++] = lower_32_bits(addr);
 		bb->cs[bb->len++] = upper_32_bits(addr);
@@ -693,8 +722,8 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 		bool usm = xe->info.has_usm;
 		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

-		src_L0 = xe_migrate_res_sizes(xe, &src_it);
-		dst_L0 = xe_migrate_res_sizes(xe, &dst_it);
+		src_L0 = xe_migrate_res_sizes(m, &src_it);
+		dst_L0 = xe_migrate_res_sizes(m, &dst_it);

 		drm_dbg(&xe->drm, "Pass %u, sizes: %llu & %llu\n",
 			pass++, src_L0, dst_L0);
@@ -715,6 +744,7 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 					      &ccs_ofs, &ccs_pt, 0,
 					      2 * avail_pts,
 					      avail_pts);
+			xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));
 		}

 		/* Add copy commands size here */
@@ -727,20 +757,20 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
 			goto err_sync;
 		}

-		if (!src_is_vram)
-			emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
-				 src_bo);
-		else
+		if (src_is_vram && xe_migrate_allow_identity(src_L0, &src_it))
 			xe_res_next(&src_it, src_L0);
-		if (!dst_is_vram)
-			emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
-				 dst_bo);
 		else
+			emit_pte(m, bb, src_L0_pt, src_is_vram, true, &src_it, src_L0,
+				 src);
+
+		if (dst_is_vram && xe_migrate_allow_identity(src_L0, &dst_it))
 			xe_res_next(&dst_it, src_L0);
+		else
+			emit_pte(m, bb, dst_L0_pt, dst_is_vram, true, &dst_it, src_L0,
+				 dst);

 		if (copy_system_ccs)
-			emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src_bo);
+			emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);

 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
 		update_idx = bb->len;
@@ -949,7 +979,7 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 		bool usm = xe->info.has_usm;
 		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;

-		clear_L0 = xe_migrate_res_sizes(xe, &src_it);
+		clear_L0 = xe_migrate_res_sizes(m, &src_it);

 		drm_dbg(&xe->drm, "Pass %u, size: %llu\n", pass++, clear_L0);
@@ -976,12 +1006,12 @@ struct dma_fence *xe_migrate_clear(struct xe_migrate *m,
 		size -= clear_L0;
 		/* Preemption is enabled again by the ring ops. */
-		if (!clear_vram) {
-			emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
-				 bo);
-		} else {
+		if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
 			xe_res_next(&src_it, clear_L0);
-		}
+		else
+			emit_pte(m, bb, clear_L0_pt, clear_vram, true, &src_it, clear_L0,
+				 dst);

 		bb->cs[bb->len++] = MI_BATCH_BUFFER_END;
 		update_idx = bb->len;
...