Commit e45b011d authored by Mukul Joshi, committed by Alex Deucher

drm/amdkfd: Fix CU occupancy for GFX 9.4.3

Make CU occupancy calculations work on GFX 9.4.3 by
updating the logic to handle multiple XCCs correctly.
Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 6ae9e1ab
...@@ -963,14 +963,14 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx, ...@@ -963,14 +963,14 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
*/ */
pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe; pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe; queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst); soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, GET_INST(GC, inst));
reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, GET_INST(GC, inst),
mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot); mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot);
wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
if (wave_cnt != 0) { if (wave_cnt != 0) {
queue_cnt->wave_cnt += wave_cnt; queue_cnt->wave_cnt += wave_cnt;
queue_cnt->doorbell_off = queue_cnt->doorbell_off =
(RREG32_SOC15(GC, inst, mmCP_HQD_PQ_DOORBELL_CONTROL) & (RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL) &
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >> CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >>
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
} }
...@@ -1033,7 +1033,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, ...@@ -1033,7 +1033,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES); DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);
lock_spi_csq_mutexes(adev); lock_spi_csq_mutexes(adev);
soc15_grbm_select(adev, 1, 0, 0, 0, inst); soc15_grbm_select(adev, 1, 0, 0, 0, GET_INST(GC, inst));
/* /*
* Iterate through the shader engines and arrays of the device * Iterate through the shader engines and arrays of the device
...@@ -1046,7 +1046,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, ...@@ -1046,7 +1046,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
se_cnt = adev->gfx.config.max_shader_engines; se_cnt = adev->gfx.config.max_shader_engines;
for (se_idx = 0; se_idx < se_cnt; se_idx++) { for (se_idx = 0; se_idx < se_cnt; se_idx++) {
amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst); amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst);
queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS); queue_map = RREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_CSQ_WF_ACTIVE_STATUS);
/* /*
* Assumption: queue map encodes following schema: four * Assumption: queue map encodes following schema: four
...@@ -1071,7 +1071,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, ...@@ -1071,7 +1071,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
} }
amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst); amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst);
soc15_grbm_select(adev, 0, 0, 0, 0, inst); soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst));
unlock_spi_csq_mutexes(adev); unlock_spi_csq_mutexes(adev);
/* Update the output parameters and return */ /* Update the output parameters and return */
......
...@@ -3542,15 +3542,19 @@ int debug_refresh_runlist(struct device_queue_manager *dqm) ...@@ -3542,15 +3542,19 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
struct qcm_process_device *qpd, struct qcm_process_device *qpd,
int doorbell_off) int doorbell_off, u32 *queue_format)
{ {
struct queue *q; struct queue *q;
bool r = false; bool r = false;
if (!queue_format)
return r;
dqm_lock(dqm); dqm_lock(dqm);
list_for_each_entry(q, &qpd->queues_list, list) { list_for_each_entry(q, &qpd->queues_list, list) {
if (q->properties.doorbell_off == doorbell_off) { if (q->properties.doorbell_off == doorbell_off) {
*queue_format = q->properties.format;
r = true; r = true;
goto out; goto out;
} }
......
...@@ -326,7 +326,7 @@ int debug_map_and_unlock(struct device_queue_manager *dqm); ...@@ -326,7 +326,7 @@ int debug_map_and_unlock(struct device_queue_manager *dqm);
int debug_refresh_runlist(struct device_queue_manager *dqm); int debug_refresh_runlist(struct device_queue_manager *dqm);
bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
struct qcm_process_device *qpd, struct qcm_process_device *qpd,
int doorbell_off); int doorbell_off, u32 *queue_format);
static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
{ {
......
...@@ -272,6 +272,7 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) ...@@ -272,6 +272,7 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
struct kfd_process_device *pdd = NULL; struct kfd_process_device *pdd = NULL;
int i; int i;
struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES]; struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES];
u32 queue_format;
memset(cu_occupancy, 0x0, sizeof(cu_occupancy)); memset(cu_occupancy, 0x0, sizeof(cu_occupancy));
...@@ -292,14 +293,27 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) ...@@ -292,14 +293,27 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
wave_cnt = 0; wave_cnt = 0;
max_waves_per_cu = 0; max_waves_per_cu = 0;
/*
* For GFX 9.4.3, fetch the CU occupancy from the first XCC in the partition.
* For AQL queues, because of cooperative dispatch we multiply the wave count
* by number of XCCs in the partition to get the total wave counts across all
* XCCs in the partition.
* For PM4 queues, there is no cooperative dispatch so wave_cnt stay as it is.
*/
dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy, dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy,
&max_waves_per_cu, 0); &max_waves_per_cu, ffs(dev->xcc_mask) - 1);
for (i = 0; i < AMDGPU_MAX_QUEUES; i++) { for (i = 0; i < AMDGPU_MAX_QUEUES; i++) {
if (cu_occupancy[i].wave_cnt != 0 && if (cu_occupancy[i].wave_cnt != 0 &&
kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd, kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd,
cu_occupancy[i].doorbell_off)) cu_occupancy[i].doorbell_off,
&queue_format)) {
if (unlikely(queue_format == KFD_QUEUE_FORMAT_PM4))
wave_cnt += cu_occupancy[i].wave_cnt; wave_cnt += cu_occupancy[i].wave_cnt;
else
wave_cnt += (NUM_XCC(dev->xcc_mask) *
cu_occupancy[i].wave_cnt);
}
} }
/* Translate wave count to number of compute units */ /* Translate wave count to number of compute units */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment