Commit e42051d2 authored by Shaoyun Liu, committed by Oded Gabbay

drm/amdkfd: Implement GPU reset handlers in KFD

Lock KFD and evict existing queues on reset. Notify user mode by
signaling hw_exception events.

Signed-off-by: Shaoyun Liu <Shaoyun.Liu@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
parent 5c6dd71e
...@@ -122,6 +122,9 @@ static int kfd_open(struct inode *inode, struct file *filep) ...@@ -122,6 +122,9 @@ static int kfd_open(struct inode *inode, struct file *filep)
if (IS_ERR(process)) if (IS_ERR(process))
return PTR_ERR(process); return PTR_ERR(process);
if (kfd_is_locked())
return -EAGAIN;
dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n", dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - %d\n",
process->pasid, process->is_32bit_user_mode); process->pasid, process->is_32bit_user_mode);
......
...@@ -30,7 +30,13 @@ ...@@ -30,7 +30,13 @@
#include "kfd_iommu.h" #include "kfd_iommu.h"
#define MQD_SIZE_ALIGNED 768 #define MQD_SIZE_ALIGNED 768
static atomic_t kfd_device_suspended = ATOMIC_INIT(0);
/*
 * kfd_locked is used to lock the kfd driver during suspend or reset.
 * Once locked, the kfd driver will stop any further GPU execution, and
 * creating a process (open) will return -EAGAIN.
 */
static atomic_t kfd_locked = ATOMIC_INIT(0);
#ifdef KFD_SUPPORT_IOMMU_V2 #ifdef KFD_SUPPORT_IOMMU_V2
static const struct kfd_device_info kaveri_device_info = { static const struct kfd_device_info kaveri_device_info = {
...@@ -516,12 +522,43 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) ...@@ -516,12 +522,43 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
int kgd2kfd_pre_reset(struct kfd_dev *kfd) int kgd2kfd_pre_reset(struct kfd_dev *kfd)
{ {
if (!kfd->init_complete)
return 0;
kgd2kfd_suspend(kfd);
/* hold dqm->lock to prevent further execution*/
dqm_lock(kfd->dqm);
kfd_signal_reset_event(kfd);
return 0; return 0;
} }
/*
 * FIXME: KFD is not able to resume existing processes for now.
 * All existing processes are kept in an evicted state until
 * they are terminated.
 */
int kgd2kfd_post_reset(struct kfd_dev *kfd) int kgd2kfd_post_reset(struct kfd_dev *kfd)
{ {
int ret, count;
if (!kfd->init_complete)
return 0; return 0;
dqm_unlock(kfd->dqm);
ret = kfd_resume(kfd);
if (ret)
return ret;
count = atomic_dec_return(&kfd_locked);
WARN_ONCE(count != 0, "KFD reset ref. error");
return 0;
}
bool kfd_is_locked(void)
{
return (atomic_read(&kfd_locked) > 0);
} }
void kgd2kfd_suspend(struct kfd_dev *kfd) void kgd2kfd_suspend(struct kfd_dev *kfd)
...@@ -530,7 +567,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd) ...@@ -530,7 +567,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
return; return;
/* For first KFD device suspend all the KFD processes */ /* For first KFD device suspend all the KFD processes */
if (atomic_inc_return(&kfd_device_suspended) == 1) if (atomic_inc_return(&kfd_locked) == 1)
kfd_suspend_all_processes(); kfd_suspend_all_processes();
kfd->dqm->ops.stop(kfd->dqm); kfd->dqm->ops.stop(kfd->dqm);
...@@ -549,7 +586,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd) ...@@ -549,7 +586,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
if (ret) if (ret)
return ret; return ret;
count = atomic_dec_return(&kfd_device_suspended); count = atomic_dec_return(&kfd_locked);
WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); WARN_ONCE(count < 0, "KFD suspend / resume ref. error");
if (count == 0) if (count == 0)
ret = kfd_resume_all_processes(); ret = kfd_resume_all_processes();
......
...@@ -1000,3 +1000,30 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, ...@@ -1000,3 +1000,30 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
mutex_unlock(&p->event_mutex); mutex_unlock(&p->event_mutex);
kfd_unref_process(p); kfd_unref_process(p);
} }
void kfd_signal_reset_event(struct kfd_dev *dev)
{
struct kfd_hsa_hw_exception_data hw_exception_data;
struct kfd_process *p;
struct kfd_event *ev;
unsigned int temp;
uint32_t id, idx;
/* Whole gpu reset caused by GPU hang and memory is lost */
memset(&hw_exception_data, 0, sizeof(hw_exception_data));
hw_exception_data.gpu_id = dev->id;
hw_exception_data.memory_lost = 1;
idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
mutex_lock(&p->event_mutex);
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
idr_for_each_entry_continue(&p->event_idr, ev, id)
if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
ev->hw_exception_data = hw_exception_data;
set_event(ev);
}
mutex_unlock(&p->event_mutex);
}
srcu_read_unlock(&kfd_processes_srcu, idx);
}
...@@ -66,6 +66,7 @@ struct kfd_event { ...@@ -66,6 +66,7 @@ struct kfd_event {
/* type specific data */ /* type specific data */
union { union {
struct kfd_hsa_memory_exception_data memory_exception_data; struct kfd_hsa_memory_exception_data memory_exception_data;
struct kfd_hsa_hw_exception_data hw_exception_data;
}; };
}; };
......
...@@ -975,10 +975,14 @@ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id); ...@@ -975,10 +975,14 @@ int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid, void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
struct kfd_vm_fault_info *info); struct kfd_vm_fault_info *info);
void kfd_signal_reset_event(struct kfd_dev *dev);
void kfd_flush_tlb(struct kfd_process_device *pdd); void kfd_flush_tlb(struct kfd_process_device *pdd);
int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p); int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
bool kfd_is_locked(void);
/* Debugfs */ /* Debugfs */
#if defined(CONFIG_DEBUG_FS) #if defined(CONFIG_DEBUG_FS)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment