Commit dbe2c4c8 authored by Eric Huang, committed by Alex Deucher

drm/amdkfd: add reset cause in gpu pre-reset smi event

Reset cause is requested by a customer as additional info for the gpu
reset SMI event. Consumers see the cause string appended after the
reset sequence number in the event payload (an illustrative
consumer-side parsing sketch follows the commit metadata below).

v2: integrate reset sources as suggested by Lijo Lazar
Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 3c7758be
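For context only (not part of this commit): with the change below, the pre-reset SMI event payload carries the reset-cause string after the reset sequence number, with the event id prepended by kfd_smi_event_add(). The following userspace sketch shows one way a listener might split such a line; the parse_pre_reset() helper and the assumed line layout are illustrative assumptions, and the exact event-id values and cause text depend on the running kernel.

/* Hypothetical consumer-side sketch, assuming an SMI event line of the
 * form "<event id hex> <reset_seq_num hex> <cause string>\n", where the
 * cause was filled in by amdgpu_reset_get_desc() on the kernel side.
 */
#include <stdio.h>
#include <string.h>

static void parse_pre_reset(const char *line)
{
        unsigned int event_id, seq_num;
        char cause[64] = "";
        int consumed = 0;

        /* Two hex fields match the "%x %s\n" payload added by this patch,
         * prefixed with the event id; %n records where the cause starts. */
        if (sscanf(line, "%x %x %n", &event_id, &seq_num, &consumed) < 2)
                return;

        strncpy(cause, line + consumed, sizeof(cause) - 1);
        cause[strcspn(cause, "\n")] = '\0';     /* trim trailing newline */

        printf("event %x: gpu pre-reset #%u, cause: %s\n",
               event_id, seq_num, cause);
}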
@@ -133,6 +133,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
 	reset_context.method = AMD_RESET_METHOD_NONE;
 	reset_context.reset_req_dev = adev;
+	reset_context.src = adev->enable_mes ?
+			    AMDGPU_RESET_SRC_MES :
+			    AMDGPU_RESET_SRC_HWS;
 	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
 
 	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
@@ -261,12 +264,13 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
 	return r;
 }
 
-int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev)
+int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
+			    struct amdgpu_reset_context *reset_context)
 {
 	int r = 0;
 
 	if (adev->kfd.dev)
-		r = kgd2kfd_pre_reset(adev->kfd.dev);
+		r = kgd2kfd_pre_reset(adev->kfd.dev, reset_context);
 
 	return r;
 }
...
@@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE {
 };
 
 struct amdgpu_device;
+struct amdgpu_reset_context;
 
 enum kfd_mem_attachment_type {
 	KFD_MEM_ATT_SHARED,	/* Share kgd_mem->bo or another attachment's */
@@ -170,7 +171,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
-int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
+int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
+			    struct amdgpu_reset_context *reset_context);
 int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev);
@@ -416,7 +418,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 void kgd2kfd_device_exit(struct kfd_dev *kfd);
 void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
 int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
-int kgd2kfd_pre_reset(struct kfd_dev *kfd);
+int kgd2kfd_pre_reset(struct kfd_dev *kfd,
+		      struct amdgpu_reset_context *reset_context);
 int kgd2kfd_post_reset(struct kfd_dev *kfd);
 void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
 void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
@@ -459,7 +462,8 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
 	return 0;
 }
 
-static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)
+static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd,
+				    struct amdgpu_reset_context *reset_context)
 {
 	return 0;
 }
...
@@ -5775,7 +5775,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
 
-		amdgpu_amdkfd_pre_reset(tmp_adev);
+		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
 
 		/*
 		 * Mark these ASICs to be reseted as untracked first
...
@@ -924,7 +924,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
 	kfree(kfd);
 }
 
-int kgd2kfd_pre_reset(struct kfd_dev *kfd)
+int kgd2kfd_pre_reset(struct kfd_dev *kfd,
+		      struct amdgpu_reset_context *reset_context)
 {
 	struct kfd_node *node;
 	int i;
@@ -934,7 +935,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 	for (i = 0; i < kfd->num_nodes; i++) {
 		node = kfd->nodes[i];
-		kfd_smi_event_update_gpu_reset(node, false);
+		kfd_smi_event_update_gpu_reset(node, false, reset_context);
 		node->dqm->ops.pre_reset(node->dqm);
 	}
@@ -974,7 +975,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
 	for (i = 0; i < kfd->num_nodes; i++) {
 		node = kfd->nodes[i];
 		atomic_set(&node->sram_ecc_flag, 0);
-		kfd_smi_event_update_gpu_reset(node, true);
+		kfd_smi_event_update_gpu_reset(node, true, NULL);
 	}
 
 	return 0;
...
@@ -29,6 +29,7 @@
 #include "amdgpu_vm.h"
 #include "kfd_priv.h"
 #include "kfd_smi_events.h"
+#include "amdgpu_reset.h"
 
 struct kfd_smi_client {
 	struct list_head list;
@@ -215,9 +216,11 @@ static void kfd_smi_event_add(pid_t pid, struct kfd_node *dev,
 	add_event_to_kfifo(pid, dev, event, fifo_in, len);
 }
 
-void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
+void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
+				    struct amdgpu_reset_context *reset_context)
 {
 	unsigned int event;
+	char reset_cause[64];
 
 	if (post_reset) {
 		event = KFD_SMI_EVENT_GPU_POST_RESET;
@@ -225,7 +228,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
 		event = KFD_SMI_EVENT_GPU_PRE_RESET;
 		++(dev->reset_seq_num);
 	}
-	kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num);
+
+	memset(reset_cause, 0, sizeof(reset_cause));
+
+	if (reset_context)
+		amdgpu_reset_get_desc(reset_context, reset_cause,
+				      sizeof(reset_cause));
+
+	kfd_smi_event_add(0, dev, event, "%x %s\n",
+			  dev->reset_seq_num,
+			  reset_cause);
 }
 
 void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
...
@@ -24,11 +24,14 @@
 #ifndef KFD_SMI_EVENTS_H_INCLUDED
 #define KFD_SMI_EVENTS_H_INCLUDED
 
+struct amdgpu_reset_context;
+
 int kfd_smi_event_open(struct kfd_node *dev, uint32_t *fd);
 void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t pasid);
 void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
 					     uint64_t throttle_bitmask);
-void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset);
+void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
+				    struct amdgpu_reset_context *reset_context);
 void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t pid,
 				    unsigned long address, bool write_fault,
 				    ktime_t ts);
...