Commit 6049db43 authored by Dennis Li, committed by Alex Deucher

drm/amdgpu: change reset lock from mutex to rw_semaphore

Clients don't need the reset lock for synchronization when no
GPU recovery is in progress.

v2:
return the return value of down_read_killable.

v3:
if GPU recovery has begun, the VF ignores the FLR notification.
Reviewed-by: Monk Liu <monk.liu@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 66b8a9c0
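
For context: a rw_semaphore allows any number of holders on the read side at once, while a writer excludes them all. The patch puts ordinary clients on the read side and GPU recovery on the write side, so clients only serialize when a reset is actually running. A minimal sketch of the pattern, with illustrative names (client_op and gpu_recover are not from this patch):

#include <linux/rwsem.h>

static DECLARE_RWSEM(reset_sem);	/* stands in for adev->reset_sem */

/* Client path: many of these may run concurrently. */
static int client_op(void)
{
	int r;

	/* Blocks while recovery holds the write side; -EINTR if killed. */
	r = down_read_killable(&reset_sem);
	if (r)
		return r;

	/* ... touch state that must not race a GPU reset ... */

	up_read(&reset_sem);
	return 0;
}

/* Recovery path: waits out all readers, then runs exclusively. */
static void gpu_recover(void)
{
	down_write(&reset_sem);
	/* ... perform the reset ... */
	up_write(&reset_sem);
}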
@@ -951,7 +951,7 @@ struct amdgpu_device {
 	atomic_t in_gpu_reset;
 	enum pp_mp1_state mp1_state;
-	struct mutex lock_reset;
+	struct rw_semaphore reset_sem;
 	struct amdgpu_doorbell_index doorbell_index;
 	struct mutex notifier_lock;

...
@@ -101,14 +101,18 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
 	file->private_data = adev;

-	mutex_lock(&adev->lock_reset);
+	ret = down_read_killable(&adev->reset_sem);
+	if (ret)
+		return ret;
+
 	if (adev->autodump.dumping.done) {
 		reinit_completion(&adev->autodump.dumping);
 		ret = 0;
 	} else {
 		ret = -EBUSY;
 	}

-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);

 	return ret;
 }
@@ -1242,7 +1246,9 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
 	}

 	/* Avoid accidently unparking the sched thread during GPU reset */
-	mutex_lock(&adev->lock_reset);
+	r = down_read_killable(&adev->reset_sem);
+	if (r)
+		return r;

 	/* hold on the scheduler */
 	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -1269,7 +1275,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
 		kthread_unpark(ring->sched.thread);
 	}

-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);

 	pm_runtime_mark_last_busy(dev->dev);
 	pm_runtime_put_autosuspend(dev->dev);
@@ -1459,7 +1465,9 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 		return -ENOMEM;

 	/* Avoid accidently unparking the sched thread during GPU reset */
-	mutex_lock(&adev->lock_reset);
+	r = down_read_killable(&adev->reset_sem);
+	if (r)
+		goto pro_end;

 	/* stop the scheduler */
 	kthread_park(ring->sched.thread);
@@ -1500,13 +1508,14 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
 	/* restart the scheduler */
 	kthread_unpark(ring->sched.thread);

-	mutex_unlock(&adev->lock_reset);
+	up_read(&adev->reset_sem);

 	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);

+pro_end:
 	kfree(fences);

-	return 0;
+	return r;
 }

 static int amdgpu_debugfs_sclk_set(void *data, u64 val)
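
A note on the two hunks above: fences is allocated before the lock attempt, so the early-exit path must still free it; hence the new pro_end label and the switch from return 0 to return r. Reduced to its shape (an illustrative sketch, not the driver function):

#include <linux/rwsem.h>
#include <linux/slab.h>

static int preempt_op(struct rw_semaphore *sem, void *fences)
{
	int r;

	r = down_read_killable(sem);
	if (r)
		goto pro_end;	/* lock wait killed: skip the work, still free */

	/* ... park the scheduler, preempt, unpark ... */

	up_read(sem);

pro_end:
	kfree(fences);
	return r;	/* 0 on success, -EINTR if interrupted */
}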
...
@@ -3054,7 +3054,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->virt.vf_errors.lock);
 	hash_init(adev->mn_hash);
 	atomic_set(&adev->in_gpu_reset, 0);
-	mutex_init(&adev->lock_reset);
+	init_rwsem(&adev->reset_sem);
 	mutex_init(&adev->psp.mutex);
 	mutex_init(&adev->notifier_lock);
@@ -4206,7 +4206,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev)
 	if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
 		return false;

-	mutex_lock(&adev->lock_reset);
+	down_write(&adev->reset_sem);

 	atomic_inc(&adev->gpu_reset_counter);
 	switch (amdgpu_asic_reset_method(adev)) {
@@ -4229,7 +4229,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 	amdgpu_vf_error_trans_all(adev);
 	adev->mp1_state = PP_MP1_STATE_NONE;
 	atomic_set(&adev->in_gpu_reset, 0);
-	mutex_unlock(&adev->lock_reset);
+	up_write(&adev->reset_sem);
 }

 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
...
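
The write side keeps its existing entry guard: atomic_cmpxchg on in_gpu_reset lets exactly one recovery attempt through, and only that winner takes the semaphore for write. Roughly (a sketch of the pairing, not the full lock_adev/unlock_adev functions):

#include <linux/atomic.h>
#include <linux/rwsem.h>

static atomic_t in_gpu_reset = ATOMIC_INIT(0);
static DECLARE_RWSEM(reset_sem);

static bool lock_for_reset(void)
{
	/* Only the first caller flips 0 -> 1; concurrent resets back off. */
	if (atomic_cmpxchg(&in_gpu_reset, 0, 1) != 0)
		return false;

	down_write(&reset_sem);	/* drain readers, then exclude new ones */
	return true;
}

static void unlock_after_reset(void)
{
	atomic_set(&in_gpu_reset, 0);
	up_write(&reset_sem);
}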
@@ -238,19 +238,15 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = AI_MAILBOX_POLL_FLR_TIMEDOUT;
-	int locked;

 	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 	 * otherwise the mailbox msg will be ruined/reseted by
 	 * the VF FLR.
-	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
-	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
-	 * which means host side had finished this VF's FLR.
 	 */
-	locked = mutex_trylock(&adev->lock_reset);
-	if (locked)
-		atomic_set(&adev->in_gpu_reset, 1);
+	if (!down_read_trylock(&adev->reset_sem))
+		return;
+
+	atomic_set(&adev->in_gpu_reset, 1);

 	do {
 		if (xgpu_ai_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
@@ -261,10 +257,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
 	} while (timeout > 1);

 flr_done:
-	if (locked) {
-		atomic_set(&adev->in_gpu_reset, 0);
-		mutex_unlock(&adev->lock_reset);
-	}
+	atomic_set(&adev->in_gpu_reset, 0);
+	up_read(&adev->reset_sem);

 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
...
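
This is the v3 behavior from the commit message: the FLR worker now uses down_read_trylock, which returns 1 on success and 0 on contention (the opposite sense of down_read_killable's 0/-EINTR), so if recovery already holds the write side the VF bails out and ignores the FLR notification instead of blocking. In sketch form (illustrative, not the mxgpu code):

#include <linux/rwsem.h>

static DECLARE_RWSEM(reset_sem);

static void flr_work(void)
{
	/* Non-blocking: 1 means acquired, 0 means a writer holds it. */
	if (!down_read_trylock(&reset_sem))
		return;	/* GPU recovery in progress: ignore this FLR */

	/* ... poll the mailbox for FLR_NOTIFICATION_CMPL ... */

	up_read(&reset_sem);
}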
@@ -259,19 +259,15 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 	struct amdgpu_virt *virt = container_of(work, struct amdgpu_virt, flr_work);
 	struct amdgpu_device *adev = container_of(virt, struct amdgpu_device, virt);
 	int timeout = NV_MAILBOX_POLL_FLR_TIMEDOUT;
-	int locked;

 	/* block amdgpu_gpu_recover till msg FLR COMPLETE received,
 	 * otherwise the mailbox msg will be ruined/reseted by
 	 * the VF FLR.
-	 *
-	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
-	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
-	 * which means host side had finished this VF's FLR.
 	 */
-	locked = mutex_trylock(&adev->lock_reset);
-	if (locked)
-		atomic_set(&adev->in_gpu_reset, 1);
+	if (!down_read_trylock(&adev->reset_sem))
+		return;
+
+	atomic_set(&adev->in_gpu_reset, 1);

 	do {
 		if (xgpu_nv_mailbox_peek_msg(adev) == IDH_FLR_NOTIFICATION_CMPL)
@@ -282,10 +278,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
 	} while (timeout > 1);

 flr_done:
-	if (locked) {
-		atomic_set(&adev->in_gpu_reset, 0);
-		mutex_unlock(&adev->lock_reset);
-	}
+	atomic_set(&adev->in_gpu_reset, 0);
+	up_read(&adev->reset_sem);

 	/* Trigger recovery for world switch failure if no TDR */
 	if (amdgpu_device_should_recover_gpu(adev)
...