Commit 142262a1 authored by David Francis, committed by Alex Deucher

drm/amdgpu: Add EXT_COHERENT support for APU and NUMA systems

On gfx943 APU, EXT_COHERENT should give MTYPE_CC for local and
MTYPE_UC for nonlocal memory.

On NUMA systems, local memory gets the local mtype, set by an
override callback. If EXT_COHERENT is set, memory will be set as
MTYPE_UC by default, with local memory MTYPE_CC.

Add an option in the override function for this case, and
add a check to ensure it is not used on UNCACHED memory.

V2: Combined APU and NUMA code into one patch
V3: Fixed a potential nullptr in amdgpu_vm_bo_update
Signed-off-by: David Francis <David.Francis@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent a395f7ff
...@@ -844,6 +844,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence, ...@@ -844,6 +844,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
* @immediate: immediate submission in a page fault * @immediate: immediate submission in a page fault
* @unlocked: unlocked invalidation during MM callback * @unlocked: unlocked invalidation during MM callback
* @flush_tlb: trigger tlb invalidation after update completed * @flush_tlb: trigger tlb invalidation after update completed
* @allow_override: change MTYPE for local NUMA nodes
* @resv: fences we need to sync to * @resv: fences we need to sync to
* @start: start of mapped range * @start: start of mapped range
* @last: last mapped entry * @last: last mapped entry
...@@ -860,7 +861,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence, ...@@ -860,7 +861,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
* 0 for success, negative erro code for failure. * 0 for success, negative erro code for failure.
*/ */
int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
bool immediate, bool unlocked, bool flush_tlb, bool immediate, bool unlocked, bool flush_tlb, bool allow_override,
struct dma_resv *resv, uint64_t start, uint64_t last, struct dma_resv *resv, uint64_t start, uint64_t last,
uint64_t flags, uint64_t offset, uint64_t vram_base, uint64_t flags, uint64_t offset, uint64_t vram_base,
struct ttm_resource *res, dma_addr_t *pages_addr, struct ttm_resource *res, dma_addr_t *pages_addr,
...@@ -898,6 +899,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, ...@@ -898,6 +899,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
params.immediate = immediate; params.immediate = immediate;
params.pages_addr = pages_addr; params.pages_addr = pages_addr;
params.unlocked = unlocked; params.unlocked = unlocked;
params.allow_override = allow_override;
/* Implicitly sync to command submissions in the same VM before /* Implicitly sync to command submissions in the same VM before
* unmapping. Sync to moving fences before mapping. * unmapping. Sync to moving fences before mapping.
...@@ -1073,6 +1075,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, ...@@ -1073,6 +1075,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
struct ttm_resource *mem; struct ttm_resource *mem;
struct dma_fence **last_update; struct dma_fence **last_update;
bool flush_tlb = clear; bool flush_tlb = clear;
bool uncached;
struct dma_resv *resv; struct dma_resv *resv;
uint64_t vram_base; uint64_t vram_base;
uint64_t flags; uint64_t flags;
...@@ -1110,9 +1113,11 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, ...@@ -1110,9 +1113,11 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
bo_adev = amdgpu_ttm_adev(bo->tbo.bdev); bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
vram_base = bo_adev->vm_manager.vram_base_offset; vram_base = bo_adev->vm_manager.vram_base_offset;
uncached = (bo->flags & AMDGPU_GEM_CREATE_UNCACHED) != 0;
} else { } else {
flags = 0x0; flags = 0x0;
vram_base = 0; vram_base = 0;
uncached = false;
} }
if (clear || (bo && bo->tbo.base.resv == if (clear || (bo && bo->tbo.base.resv ==
...@@ -1146,7 +1151,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, ...@@ -1146,7 +1151,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va,
trace_amdgpu_vm_bo_update(mapping); trace_amdgpu_vm_bo_update(mapping);
r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb,
resv, mapping->start, mapping->last, !uncached, resv, mapping->start, mapping->last,
update_flags, mapping->offset, update_flags, mapping->offset,
vram_base, mem, pages_addr, vram_base, mem, pages_addr,
last_update); last_update);
...@@ -1341,8 +1346,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev, ...@@ -1341,8 +1346,8 @@ int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
mapping->start < AMDGPU_GMC_HOLE_START) mapping->start < AMDGPU_GMC_HOLE_START)
init_pte_value = AMDGPU_PTE_DEFAULT_ATC; init_pte_value = AMDGPU_PTE_DEFAULT_ATC;
r = amdgpu_vm_update_range(adev, vm, false, false, true, resv, r = amdgpu_vm_update_range(adev, vm, false, false, true, false,
mapping->start, mapping->last, resv, mapping->start, mapping->last,
init_pte_value, 0, 0, NULL, NULL, init_pte_value, 0, 0, NULL, NULL,
&f); &f);
amdgpu_vm_free_mapping(adev, vm, mapping, f); amdgpu_vm_free_mapping(adev, vm, mapping, f);
...@@ -2618,8 +2623,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, ...@@ -2618,8 +2623,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
goto error_unlock; goto error_unlock;
} }
r = amdgpu_vm_update_range(adev, vm, true, false, false, NULL, addr, r = amdgpu_vm_update_range(adev, vm, true, false, false, false,
addr, flags, value, 0, NULL, NULL, NULL); NULL, addr, addr, flags, value, 0, NULL, NULL, NULL);
if (r) if (r)
goto error_unlock; goto error_unlock;
......
...@@ -246,6 +246,12 @@ struct amdgpu_vm_update_params { ...@@ -246,6 +246,12 @@ struct amdgpu_vm_update_params {
* @table_freed: return true if page table is freed when updating * @table_freed: return true if page table is freed when updating
*/ */
bool table_freed; bool table_freed;
/**
* @allow_override: true for memory that is not uncached: allows MTYPE
* to be overridden for NUMA local memory.
*/
bool allow_override;
}; };
struct amdgpu_vm_update_funcs { struct amdgpu_vm_update_funcs {
...@@ -441,7 +447,7 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, ...@@ -441,7 +447,7 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
struct amdgpu_vm *vm, struct amdgpu_bo *bo); struct amdgpu_vm *vm, struct amdgpu_bo *bo);
int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
bool immediate, bool unlocked, bool flush_tlb, bool immediate, bool unlocked, bool flush_tlb, bool allow_override,
struct dma_resv *resv, uint64_t start, uint64_t last, struct dma_resv *resv, uint64_t start, uint64_t last,
uint64_t flags, uint64_t offset, uint64_t vram_base, uint64_t flags, uint64_t offset, uint64_t vram_base,
struct ttm_resource *res, dma_addr_t *pages_addr, struct ttm_resource *res, dma_addr_t *pages_addr,
......
...@@ -843,7 +843,7 @@ static void amdgpu_vm_pte_update_flags(struct amdgpu_vm_update_params *params, ...@@ -843,7 +843,7 @@ static void amdgpu_vm_pte_update_flags(struct amdgpu_vm_update_params *params,
*/ */
if ((flags & AMDGPU_PTE_SYSTEM) && (adev->flags & AMD_IS_APU) && if ((flags & AMDGPU_PTE_SYSTEM) && (adev->flags & AMD_IS_APU) &&
adev->gmc.gmc_funcs->override_vm_pte_flags && adev->gmc.gmc_funcs->override_vm_pte_flags &&
num_possible_nodes() > 1 && !params->pages_addr) num_possible_nodes() > 1 && !params->pages_addr && params->allow_override)
amdgpu_gmc_override_vm_pte_flags(adev, params->vm, addr, &flags); amdgpu_gmc_override_vm_pte_flags(adev, params->vm, addr, &flags);
params->vm->update_funcs->update(params, pt, pe, addr, count, incr, params->vm->update_funcs->update(params, pt, pe, addr, count, incr,
......
...@@ -1251,12 +1251,15 @@ static void gmc_v9_0_override_vm_pte_flags(struct amdgpu_device *adev, ...@@ -1251,12 +1251,15 @@ static void gmc_v9_0_override_vm_pte_flags(struct amdgpu_device *adev,
return; return;
} }
/* Only override mappings with MTYPE_NC, which is the safe default for /* MTYPE_NC is the same default and can be overridden.
* cacheable memory. * MTYPE_UC will be present if the memory is extended-coherent
* and can also be overridden.
*/ */
if ((*flags & AMDGPU_PTE_MTYPE_VG10_MASK) != if ((*flags & AMDGPU_PTE_MTYPE_VG10_MASK) !=
AMDGPU_PTE_MTYPE_VG10(MTYPE_NC)) { AMDGPU_PTE_MTYPE_VG10(MTYPE_NC) &&
dev_dbg_ratelimited(adev->dev, "MTYPE is not NC\n"); (*flags & AMDGPU_PTE_MTYPE_VG10_MASK) !=
AMDGPU_PTE_MTYPE_VG10(MTYPE_UC)) {
dev_dbg_ratelimited(adev->dev, "MTYPE is not NC or UC\n");
return; return;
} }
...@@ -1283,15 +1286,23 @@ static void gmc_v9_0_override_vm_pte_flags(struct amdgpu_device *adev, ...@@ -1283,15 +1286,23 @@ static void gmc_v9_0_override_vm_pte_flags(struct amdgpu_device *adev,
vm->mem_id, local_node, nid); vm->mem_id, local_node, nid);
if (nid == local_node) { if (nid == local_node) {
uint64_t old_flags = *flags; uint64_t old_flags = *flags;
unsigned int mtype_local = MTYPE_RW; if ((*flags & AMDGPU_PTE_MTYPE_VG10_MASK) ==
AMDGPU_PTE_MTYPE_VG10(MTYPE_NC)) {
unsigned int mtype_local = MTYPE_RW;
if (amdgpu_mtype_local == 1) if (amdgpu_mtype_local == 1)
mtype_local = MTYPE_NC; mtype_local = MTYPE_NC;
else if (amdgpu_mtype_local == 2) else if (amdgpu_mtype_local == 2)
mtype_local = MTYPE_CC; mtype_local = MTYPE_CC;
*flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) |
AMDGPU_PTE_MTYPE_VG10(mtype_local);
} else {
/* MTYPE_UC case */
*flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) |
AMDGPU_PTE_MTYPE_VG10(MTYPE_CC);
}
*flags = (*flags & ~AMDGPU_PTE_MTYPE_VG10_MASK) |
AMDGPU_PTE_MTYPE_VG10(mtype_local);
dev_dbg_ratelimited(adev->dev, "flags updated from %llx to %llx\n", dev_dbg_ratelimited(adev->dev, "flags updated from %llx to %llx\n",
old_flags, *flags); old_flags, *flags);
} }
......
...@@ -1282,7 +1282,7 @@ svm_range_get_pte_flags(struct kfd_node *node, ...@@ -1282,7 +1282,7 @@ svm_range_get_pte_flags(struct kfd_node *node,
if (num_possible_nodes() <= 1) if (num_possible_nodes() <= 1)
mapping_flags |= mtype_local; mapping_flags |= mtype_local;
else else
mapping_flags |= AMDGPU_VM_MTYPE_NC; mapping_flags |= ext_coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC;
/* system memory accessed by the dGPU */ /* system memory accessed by the dGPU */
} else { } else {
mapping_flags |= AMDGPU_VM_MTYPE_UC; mapping_flags |= AMDGPU_VM_MTYPE_UC;
...@@ -1317,7 +1317,7 @@ svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, ...@@ -1317,7 +1317,7 @@ svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
pr_debug("[0x%llx 0x%llx]\n", start, last); pr_debug("[0x%llx 0x%llx]\n", start, last);
return amdgpu_vm_update_range(adev, vm, false, true, true, NULL, start, return amdgpu_vm_update_range(adev, vm, false, true, true, false, NULL, start,
last, init_pte_value, 0, 0, NULL, NULL, last, init_pte_value, 0, 0, NULL, NULL,
fence); fence);
} }
...@@ -1424,8 +1424,8 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange, ...@@ -1424,8 +1424,8 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
* different memory partition based on fpfn/lpfn, we should use * different memory partition based on fpfn/lpfn, we should use
* same vm_manager.vram_base_offset regardless memory partition. * same vm_manager.vram_base_offset regardless memory partition.
*/ */
r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, NULL, r = amdgpu_vm_update_range(adev, vm, false, false, flush_tlb, true,
last_start, prange->start + i, NULL, last_start, prange->start + i,
pte_flags, pte_flags,
(last_start - prange->start) << PAGE_SHIFT, (last_start - prange->start) << PAGE_SHIFT,
bo_adev ? bo_adev->vm_manager.vram_base_offset : 0, bo_adev ? bo_adev->vm_manager.vram_base_offset : 0,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment