Commit f5fe7edf authored by Mukul Joshi, committed by Alex Deucher

drm/amdkfd: Update interrupt handling for GFX9.4.3

Update interrupt handling in CPX mode for GFX9.4.3 by using the
VMID space instead of the SDMA client id to determine whether an
interrupt should be processed by a KFD node. This is especially
needed for handling retry faults from the MMHUB.
Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent cb30544e
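
The heart of the change is the reworked node-matching predicate in kfd_priv.h: an interrupt is attributed to a KFD node only when both its node id bit and its VMID bit are set in that node's bitmaps. Below is a minimal, stand-alone user-space sketch of that predicate; struct fake_kfd_node is a hypothetical stand-in carrying just the two fields the real struct kfd_node check reads, not the kernel definition.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical stand-in for the two struct kfd_node fields the check reads. */
struct fake_kfd_node {
	uint32_t interrupt_bitmap;    /* node ids this KFD node services */
	uint32_t compute_vmid_bitmap; /* compute VMIDs owned by this node */
};

/* Mirrors the patched kfd_irq_is_from_node(): both the node id bit and
 * the VMID bit must be set, so the VMID space (rather than the SDMA
 * client id) decides ownership in CPX mode. */
static bool irq_is_from_node(const struct fake_kfd_node *node,
			     uint32_t node_id, uint32_t vmid)
{
	return (node->interrupt_bitmap & (1u << node_id)) != 0 &&
	       (node->compute_vmid_bitmap & (1u << vmid)) != 0;
}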
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2434,6 +2434,9 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
  * amdgpu_vm_handle_fault - graceful handling of VM faults.
  * @adev: amdgpu device pointer
  * @pasid: PASID of the VM
+ * @vmid: VMID, only used for GFX 9.4.3.
+ * @node_id: Node_id received in IH cookie. Only applicable for
+ *           GFX 9.4.3.
  * @addr: Address of the fault
  * @write_fault: true is write fault, false is read fault
  *
@@ -2441,7 +2444,7 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
  * shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			    u32 client_id, u32 node_id, uint64_t addr,
+			    u32 vmid, u32 node_id, uint64_t addr,
 			    bool write_fault)
 {
 	bool is_compute_context = false;
@@ -2466,7 +2469,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
 	addr /= AMDGPU_GPU_PAGE_SIZE;
 
-	if (is_compute_context && !svm_range_restore_pages(adev, pasid, client_id,
+	if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
 			node_id, addr, write_fault)) {
 		amdgpu_bo_unref(&root);
 		return true;
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -455,7 +455,7 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
 void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
 			     struct amdgpu_task_info *task_info);
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-			    u32 client_id, u32 node_id, uint64_t addr,
+			    u32 vmid, u32 node_id, uint64_t addr,
 			    bool write_fault);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -587,7 +587,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 			cam_index = entry->src_data[2] & 0x3ff;
 
-			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->client_id, node_id,
+			ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
 						     addr, write_fault);
 			WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
 			if (ret)
@@ -610,7 +610,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
 		/* Try to handle the recoverable page faults by filling page
 		 * tables
 		 */
-		if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->client_id, node_id,
+		if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
 					   addr, write_fault))
 			return 1;
 	}
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1073,18 +1073,14 @@ struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
 struct kfd_node *kfd_device_by_id(uint32_t gpu_id);
 struct kfd_node *kfd_device_by_pci_dev(const struct pci_dev *pdev);
 struct kfd_node *kfd_device_by_adev(const struct amdgpu_device *adev);
-static inline bool kfd_irq_is_from_node(struct kfd_node *node, uint32_t client_id,
-					uint32_t node_id)
+static inline bool kfd_irq_is_from_node(struct kfd_node *node, uint32_t node_id,
+					uint32_t vmid)
 {
-	if ((node->interrupt_bitmap & (0x1U << node_id)) ||
-	    ((node_id % 4) == 0 &&
-	    (node->interrupt_bitmap >> 16) & (0x1U << client_id)))
-		return true;
-
-	return false;
+	return (node->interrupt_bitmap & (1 << node_id)) != 0 &&
+	       (node->compute_vmid_bitmap & (1 << vmid)) != 0;
 }
 
 static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
-					uint32_t client_id, uint32_t node_id) {
+					uint32_t node_id, uint32_t vmid) {
 	struct kfd_dev *dev = adev->kfd.dev;
 	uint32_t i;
 
@@ -1092,7 +1088,7 @@ static inline struct kfd_node *kfd_node_by_irq_ids(struct amdgpu_device *adev,
 		return dev->nodes[0];
 
 	for (i = 0; i < dev->num_nodes; i++)
-		if (kfd_irq_is_from_node(dev->nodes[i], client_id, node_id))
+		if (kfd_irq_is_from_node(dev->nodes[i], node_id, vmid))
 			return dev->nodes[i];
 
 	return NULL;
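
Continuing the user-space sketch from above, the lookup in kfd_node_by_irq_ids() then reduces to returning the first node whose bitmaps match. The two-partition table and its bitmap values below are invented for illustration; real values come from the partition setup at driver init.

/* Two hypothetical CPX partitions on one device: each services one node
 * id and owns a disjoint slice of the compute VMID range 8-15. */
static const struct fake_kfd_node nodes[] = {
	{ .interrupt_bitmap = 0x1u, .compute_vmid_bitmap = 0x0f00u }, /* VMIDs 8-11  */
	{ .interrupt_bitmap = 0x2u, .compute_vmid_bitmap = 0xf000u }, /* VMIDs 12-15 */
};

/* Follows the shape of the patched kfd_node_by_irq_ids() scan. */
static const struct fake_kfd_node *node_by_irq_ids(uint32_t node_id,
						   uint32_t vmid)
{
	unsigned int i;

	for (i = 0; i < sizeof(nodes) / sizeof(nodes[0]); i++)
		if (irq_is_from_node(&nodes[i], node_id, vmid))
			return &nodes[i];

	return NULL; /* interrupt is not for any KFD node */
}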
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2799,7 +2799,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
 
 int
 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
-			uint32_t client_id, uint32_t node_id,
+			uint32_t vmid, uint32_t node_id,
 			uint64_t addr, bool write_fault)
 {
 	struct mm_struct *mm = NULL;
@@ -2851,10 +2851,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
 		goto out;
 	}
 
-	node = kfd_node_by_irq_ids(adev, node_id, client_id);
+	node = kfd_node_by_irq_ids(adev, node_id, vmid);
 	if (!node) {
-		pr_debug("kfd node does not exist node_id: %d, client_id: %d\n", node_id,
-			 client_id);
+		pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
+			 vmid);
 		r = -EFAULT;
 		goto out;
 	}
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -173,7 +173,7 @@ int svm_range_split_by_granularity(struct kfd_process *p, struct mm_struct *mm,
 			unsigned long addr, struct svm_range *parent,
 			struct svm_range *prange);
 int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
-			    uint32_t client_id, uint32_t node_id, uint64_t addr,
+			    uint32_t vmid, uint32_t node_id, uint64_t addr,
 			    bool write_fault);
 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
 void svm_range_add_list_work(struct svm_range_list *svms,
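
Putting the two sketches together, a couple of hypothetical retry faults route as follows; the expected results are noted in the comments, and the -EFAULT behavior for an unmatched fault comes from the svm_range_restore_pages() hunk above.

#include <stdio.h>

int main(void)
{
	/* node_id 1, VMID 12: both bits match the second partition. */
	printf("%s\n", node_by_irq_ids(1, 12) == &nodes[1] ? "node 1" : "?");

	/* node_id 1, VMID 8: the node id bit matches partition 1, but
	 * VMID 8 belongs to partition 0, so no node claims the fault
	 * and svm_range_restore_pages() would return -EFAULT. */
	printf("%s\n", node_by_irq_ids(1, 8) == NULL ? "no node" : "?");

	return 0;
}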