Commit c6a6e2db authored by Andrey Grodzovsky's avatar Andrey Grodzovsky Committed by Alex Deucher

drm/amdgpu: Redo XGMI reset synchronization.

Use task barrier in XGMI hive to synchronize ASIC resets
across devices in XGMI hive.

v2: Return right away with a warning if no xgmi hive, update doc.
Signed-off-by: default avatarAndrey Grodzovsky <andrey.grodzovsky@amd.com>
Reviewed-by: default avatarLe Ma <Le.Ma@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent f33a8770
...@@ -66,6 +66,7 @@ ...@@ -66,6 +66,7 @@
#include "amdgpu_pmu.h" #include "amdgpu_pmu.h"
#include <linux/suspend.h> #include <linux/suspend.h>
#include <drm/task_barrier.h>
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
...@@ -2664,14 +2665,38 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) ...@@ -2664,14 +2665,38 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{ {
struct amdgpu_device *adev = struct amdgpu_device *adev =
container_of(__work, struct amdgpu_device, xgmi_reset_work); container_of(__work, struct amdgpu_device, xgmi_reset_work);
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) /* It's a bug to not have a hive within this function */
adev->asic_reset_res = (adev->in_baco == false) ? if (WARN_ON(!hive))
amdgpu_device_baco_enter(adev->ddev) : return;
amdgpu_device_baco_exit(adev->ddev);
else /*
adev->asic_reset_res = amdgpu_asic_reset(adev); * Use task barrier to synchronize all xgmi reset works across the
* hive. task_barrier_enter and task_barrier_exit will block
* until all the threads running the xgmi reset works reach
* those points. task_barrier_full will do both blocks.
*/
if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
task_barrier_enter(&hive->tb);
adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
if (adev->asic_reset_res)
goto fail;
task_barrier_exit(&hive->tb);
adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
if (adev->asic_reset_res)
goto fail;
} else {
task_barrier_full(&hive->tb);
adev->asic_reset_res = amdgpu_asic_reset(adev);
}
fail:
if (adev->asic_reset_res) if (adev->asic_reset_res)
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s", DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
adev->asic_reset_res, adev->ddev->unique); adev->asic_reset_res, adev->ddev->unique);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment