Commit a70a93fa authored by Jonathan Kim, committed by Alex Deucher

drm/amdkfd: add debug suspend and resume process queues operation

In order to inspect waves from the saved context at any point during a
debug session, the debugger must be able to preempt queues to trigger
context save by suspending them.

On queue suspend, the KFD will copy the context save header information
so that the debugger can correctly crawl the appropriate size of the saved
context. The debugger must then also be allowed to resume suspended queues.

A queue that is newly created cannot be suspended because queue ids are
recycled after destruction so the debugger needs to know that this has
occurred.  Query functions that clear a given queue of its new-queue
status will be added later.

A queue cannot be destroyed while it is suspended to preserve its saved
context during debugger inspection.  Have queue destruction block while
a queue is suspended and unblock when it is resumed.  Likewise, if a
queue is about to be destroyed, it cannot be suspended.

Return the number of queues successfully suspended or resumed along with
a per queue status array where the upper bits per queue status show that
the request was invalid (new/destroyed queue suspend request, missing
queue) or an error occurred (HWS in a fatal state so it can't suspend or
resume queues).
Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent aea1b473
...@@ -772,6 +772,11 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev) ...@@ -772,6 +772,11 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
return adev->have_atomics_support; return adev->have_atomics_support;
} }
/*
 * Debug memory fence helper exposed to KFD: forwards @adev to
 * amdgpu_device_flush_hdp(), passing NULL as the second argument.
 * NOTE(review): the meaning of the NULL argument is defined by
 * amdgpu_device_flush_hdp(), which is not visible here -- confirm.
 */
void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
{
amdgpu_device_flush_hdp(adev, NULL);
}
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset) void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset)
{ {
amdgpu_umc_poison_handler(adev, reset); amdgpu_umc_poison_handler(adev, reset);
......
...@@ -322,6 +322,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev, ...@@ -322,6 +322,7 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev,
uint64_t *mmap_offset); uint64_t *mmap_offset);
int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem, int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem,
struct dma_buf **dmabuf); struct dma_buf **dmabuf);
void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev);
int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config); struct tile_config *config);
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
......
...@@ -410,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, ...@@ -410,6 +410,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
pr_debug("Write ptr address == 0x%016llX\n", pr_debug("Write ptr address == 0x%016llX\n",
args->write_pointer_address); args->write_pointer_address);
kfd_dbg_ev_raise(KFD_EC_MASK(EC_QUEUE_NEW), p, dev, queue_id, false, NULL, 0);
return 0; return 0;
err_create_queue: err_create_queue:
...@@ -2996,7 +2997,17 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v ...@@ -2996,7 +2997,17 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
args->launch_mode.launch_mode); args->launch_mode.launch_mode);
break; break;
case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES: case KFD_IOC_DBG_TRAP_SUSPEND_QUEUES:
r = suspend_queues(target,
args->suspend_queues.num_queues,
args->suspend_queues.grace_period,
args->suspend_queues.exception_mask,
(uint32_t *)args->suspend_queues.queue_array_ptr);
break;
case KFD_IOC_DBG_TRAP_RESUME_QUEUES: case KFD_IOC_DBG_TRAP_RESUME_QUEUES:
r = resume_queues(target, args->resume_queues.num_queues,
(uint32_t *)args->resume_queues.queue_array_ptr);
break;
case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH: case KFD_IOC_DBG_TRAP_SET_NODE_ADDRESS_WATCH:
case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH: case KFD_IOC_DBG_TRAP_CLEAR_NODE_ADDRESS_WATCH:
case KFD_IOC_DBG_TRAP_SET_FLAGS: case KFD_IOC_DBG_TRAP_SET_FLAGS:
......
...@@ -339,6 +339,13 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind ...@@ -339,6 +339,13 @@ void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind
} }
kfd_dbg_set_workaround(target, false); kfd_dbg_set_workaround(target, false);
if (!unwind) {
int resume_count = resume_queues(target, 0, NULL);
if (resume_count)
pr_debug("Resumed %d queues\n", resume_count);
}
} }
static void kfd_dbg_clean_exception_status(struct kfd_process *target) static void kfd_dbg_clean_exception_status(struct kfd_process *target)
......
...@@ -263,6 +263,8 @@ struct device_queue_manager { ...@@ -263,6 +263,8 @@ struct device_queue_manager {
uint32_t current_logical_xcc_start; uint32_t current_logical_xcc_start;
uint32_t wait_times; uint32_t wait_times;
wait_queue_head_t destroy_wait;
}; };
void device_queue_manager_init_cik( void device_queue_manager_init_cik(
...@@ -290,6 +292,14 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm, ...@@ -290,6 +292,14 @@ int reserve_debug_trap_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd); struct qcm_process_device *qpd);
int release_debug_trap_vmid(struct device_queue_manager *dqm, int release_debug_trap_vmid(struct device_queue_manager *dqm,
struct qcm_process_device *qpd); struct qcm_process_device *qpd);
/*
 * Suspend queues of process @p so the debugger can inspect their saved
 * context.  Per the commit description, the return value is the number of
 * queues successfully suspended, and the upper bits of each per-queue status
 * entry flag an invalid request (new/destroyed queue, missing queue) or an
 * error.
 * NOTE(review): the exact semantics of @grace_period and
 * @exception_clear_mask, and whether @usr_queue_id_array is a user-space
 * pointer, are not visible in this declaration -- confirm against the
 * definition.
 */
int suspend_queues(struct kfd_process *p,
uint32_t num_queues,
uint32_t grace_period,
uint64_t exception_clear_mask,
uint32_t *usr_queue_id_array);
/*
 * Resume queues of process @p that were previously suspended.  Per the
 * commit description, the return value is the number of queues successfully
 * resumed, with per-queue status reported through the queue array.
 * The debug-trap deactivate path calls this with @num_queues == 0 and a NULL
 * array and logs the returned count; presumably that form resumes every
 * suspended queue -- TODO confirm against the definition.
 */
int resume_queues(struct kfd_process *p,
uint32_t num_queues,
uint32_t *usr_queue_id_array);
int debug_lock_and_unmap(struct device_queue_manager *dqm); int debug_lock_and_unmap(struct device_queue_manager *dqm);
int debug_map_and_unlock(struct device_queue_manager *dqm); int debug_map_and_unlock(struct device_queue_manager *dqm);
int debug_refresh_runlist(struct device_queue_manager *dqm); int debug_refresh_runlist(struct device_queue_manager *dqm);
......
...@@ -237,6 +237,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, ...@@ -237,6 +237,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
u32 *save_area_used_size) u32 *save_area_used_size)
{ {
struct v10_compute_mqd *m; struct v10_compute_mqd *m;
struct kfd_context_save_area_header header;
m = get_mqd(mqd); m = get_mqd(mqd);
...@@ -255,6 +256,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, ...@@ -255,6 +256,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
* accessible to user mode * accessible to user mode
*/ */
header.wave_state.control_stack_size = *ctl_stack_used_size;
header.wave_state.wave_state_size = *save_area_used_size;
header.wave_state.wave_state_offset = m->cp_hqd_wg_state_offset;
header.wave_state.control_stack_offset = m->cp_hqd_cntl_stack_offset;
if (copy_to_user(ctl_stack, &header, sizeof(header.wave_state)))
return -EFAULT;
return 0; return 0;
} }
......
...@@ -291,7 +291,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, ...@@ -291,7 +291,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
u32 *save_area_used_size) u32 *save_area_used_size)
{ {
struct v11_compute_mqd *m; struct v11_compute_mqd *m;
/*struct mqd_user_context_save_area_header header;*/ struct kfd_context_save_area_header header;
m = get_mqd(mqd); m = get_mqd(mqd);
...@@ -309,16 +309,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, ...@@ -309,16 +309,15 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
* it's part of the context save area that is already * it's part of the context save area that is already
* accessible to user mode * accessible to user mode
*/ */
/* header.wave_state.control_stack_size = *ctl_stack_used_size;
header.control_stack_size = *ctl_stack_used_size; header.wave_state.wave_state_size = *save_area_used_size;
header.wave_state_size = *save_area_used_size;
header.wave_state_offset = m->cp_hqd_wg_state_offset; header.wave_state.wave_state_offset = m->cp_hqd_wg_state_offset;
header.control_stack_offset = m->cp_hqd_cntl_stack_offset; header.wave_state.control_stack_offset = m->cp_hqd_cntl_stack_offset;
if (copy_to_user(ctl_stack, &header, sizeof(header))) if (copy_to_user(ctl_stack, &header, sizeof(header.wave_state)))
return -EFAULT; return -EFAULT;
*/
return 0; return 0;
} }
......
...@@ -311,6 +311,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, ...@@ -311,6 +311,7 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
u32 *save_area_used_size) u32 *save_area_used_size)
{ {
struct v9_mqd *m; struct v9_mqd *m;
struct kfd_context_save_area_header header;
/* Control stack is located one page after MQD. */ /* Control stack is located one page after MQD. */
void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE); void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
...@@ -322,7 +323,18 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd, ...@@ -322,7 +323,18 @@ static int get_wave_state(struct mqd_manager *mm, void *mqd,
*save_area_used_size = m->cp_hqd_wg_state_offset - *save_area_used_size = m->cp_hqd_wg_state_offset -
m->cp_hqd_cntl_stack_size; m->cp_hqd_cntl_stack_size;
if (copy_to_user(ctl_stack, mqd_ctl_stack, m->cp_hqd_cntl_stack_size)) header.wave_state.control_stack_size = *ctl_stack_used_size;
header.wave_state.wave_state_size = *save_area_used_size;
header.wave_state.wave_state_offset = m->cp_hqd_wg_state_offset;
header.wave_state.control_stack_offset = m->cp_hqd_cntl_stack_offset;
if (copy_to_user(ctl_stack, &header, sizeof(header.wave_state)))
return -EFAULT;
if (copy_to_user(ctl_stack + m->cp_hqd_cntl_stack_offset,
mqd_ctl_stack + m->cp_hqd_cntl_stack_offset,
*ctl_stack_used_size))
return -EFAULT; return -EFAULT;
return 0; return 0;
......
...@@ -510,6 +510,8 @@ struct queue_properties { ...@@ -510,6 +510,8 @@ struct queue_properties {
uint32_t doorbell_off; uint32_t doorbell_off;
bool is_interop; bool is_interop;
bool is_evicted; bool is_evicted;
bool is_suspended;
bool is_being_destroyed;
bool is_active; bool is_active;
bool is_gws; bool is_gws;
uint32_t pm4_target_xcc; uint32_t pm4_target_xcc;
...@@ -535,7 +537,8 @@ struct queue_properties { ...@@ -535,7 +537,8 @@ struct queue_properties {
#define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 && \ #define QUEUE_IS_ACTIVE(q) ((q).queue_size > 0 && \
(q).queue_address != 0 && \ (q).queue_address != 0 && \
(q).queue_percent > 0 && \ (q).queue_percent > 0 && \
!(q).is_evicted) !(q).is_evicted && \
!(q).is_suspended)
enum mqd_update_flag { enum mqd_update_flag {
UPDATE_FLAG_DBG_WA_ENABLE = 1, UPDATE_FLAG_DBG_WA_ENABLE = 1,
......
...@@ -187,6 +187,7 @@ static int init_user_queue(struct process_queue_manager *pqm, ...@@ -187,6 +187,7 @@ static int init_user_queue(struct process_queue_manager *pqm,
/* Doorbell initialized in user space*/ /* Doorbell initialized in user space*/
q_properties->doorbell_ptr = NULL; q_properties->doorbell_ptr = NULL;
q_properties->exception_status = KFD_EC_MASK(EC_QUEUE_NEW);
/* let DQM handle it*/ /* let DQM handle it*/
q_properties->vmid = 0; q_properties->vmid = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment