Commit f73c6376 authored by Oded Gabbay

habanalabs: add gaudi2 wait-for-CS support

In Gaudi2 we moved to a different model for waiting on command
submission completion. Instead of receiving an interrupt only on
external queues, we use the device's sync manager to notify us when
the entire command submission finishes.

This enables us to remove the categorization of queues into external
and internal, and treat each queue equally, without the need to parse
and patch any command buffer.

This change also requires refactoring of the IRQ handling of
CS completions.
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent de88aa67
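Before the diff itself, here is a minimal user-space sketch of the completion flow described above: on submission the CS pointer is stored in a shadow ring indexed by the sequence number masked with (max_pending_cs - 1), and on a completion interrupt the CS is looked up by the shadow index reported in the CQ entry, rather than walking per-queue job shadows. Everything below (toy_cs, toy_schedule_cs, toy_cs_finish, MAX_PENDING_CS) is an illustrative stand-in, not the driver's real code.

#include <stdio.h>
#include <stdint.h>

#define MAX_PENDING_CS 64u	/* stand-in value; the index mask needs a power of two */

struct toy_cs {
	uint64_t sequence;
	int completed;
};

static struct toy_cs *shadow_cs_queue[MAX_PENDING_CS];

/* submission side: mirrors the new store in hl_hw_queue_schedule_cs() */
static void toy_schedule_cs(struct toy_cs *cs)
{
	shadow_cs_queue[cs->sequence & (MAX_PENDING_CS - 1)] = cs;
}

/* completion side: mirrors cs_finish() looking the CS up by shadow index */
static void toy_cs_finish(uint16_t cs_seq)
{
	struct toy_cs *cs = shadow_cs_queue[cs_seq & (MAX_PENDING_CS - 1)];

	if (!cs) {
		fprintf(stderr, "no CS in shadow array at index %u\n",
			(unsigned int)cs_seq);
		return;
	}
	cs->completed = 1;	/* the real driver queues cs->finish_work instead */
}

int main(void)
{
	struct toy_cs cs = { .sequence = 129 };

	toy_schedule_cs(&cs);
	toy_cs_finish((uint16_t)cs.sequence);	/* index as reported by the CQ entry */
	printf("cs %llu completed=%d\n",
	       (unsigned long long)cs.sequence, cs.completed);
	return 0;
}

The masking in the patch implies max_pending_cs is a power of two, so the 64-bit sequence number becomes a cheap ring index; that is also why hl_device_init() below sizes shadow_cs_queue to asic_prop.max_pending_cs.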
@@ -152,12 +152,12 @@ static int command_submission_show(struct seq_file *s, void *data)
if (first) {
first = false;
seq_puts(s, "\n");
seq_puts(s, " CS ID CTX ASID CS RefCnt Submitted Completed\n");
seq_puts(s, "------------------------------------------------------\n");
seq_puts(s, " CS ID CS TYPE CTX ASID CS RefCnt Submitted Completed\n");
seq_puts(s, "----------------------------------------------------------------\n");
}
seq_printf(s,
" %llu %d %d %d %d\n",
cs->sequence, cs->ctx->asid,
" %llu %d %d %d %d %d\n",
cs->sequence, cs->type, cs->ctx->asid,
kref_read(&cs->refcount),
cs->submitted, cs->completed);
}
@@ -183,17 +183,18 @@ static int command_submission_jobs_show(struct seq_file *s, void *data)
if (first) {
first = false;
seq_puts(s, "\n");
seq_puts(s, " JOB ID CS ID CTX ASID JOB RefCnt H/W Queue\n");
seq_puts(s, "----------------------------------------------------\n");
seq_puts(s, " JOB ID CS ID CS TYPE CTX ASID JOB RefCnt H/W Queue\n");
seq_puts(s, "---------------------------------------------------------------\n");
}
if (job->cs)
seq_printf(s,
" %02d %llu %d %d %d\n",
job->id, job->cs->sequence, job->cs->ctx->asid,
kref_read(&job->refcount), job->hw_queue_id);
" %02d %llu %d %d %d %d\n",
job->id, job->cs->sequence, job->cs->type,
job->cs->ctx->asid, kref_read(&job->refcount),
job->hw_queue_id);
else
seq_printf(s,
" %02d 0 %d %d %d\n",
" %02d 0 0 %d %d %d\n",
job->id, HL_KERNEL_ASID_ID,
kref_read(&job->refcount), job->hw_queue_id);
}
......
@@ -15,14 +15,14 @@
#define HL_RESET_DELAY_USEC 10000 /* 10ms */
#define MEM_SCRUB_DEFAULT_VAL 0x1122334455667788
enum dma_alloc_type {
DMA_ALLOC_COHERENT,
DMA_ALLOC_CPU_ACCESSIBLE,
DMA_ALLOC_POOL,
};
#define MEM_SCRUB_DEFAULT_VAL 0x1122334455667788
/*
* hl_set_dram_bar- sets the bar to allow later access to address
*
@@ -412,8 +412,8 @@ static int hl_device_release(struct inode *inode, struct file *filp)
*/
hl_release_pending_user_interrupts(hpriv->hdev);
hl_mem_mgr_fini(&hpriv->mem_mgr);
hl_ctx_mgr_fini(hdev, &hpriv->ctx_mgr);
hl_mem_mgr_fini(&hpriv->mem_mgr);
hdev->compute_ctx_in_release = 1;
@@ -461,7 +461,7 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
* @*filp: pointer to file structure
* @*vma: pointer to vm_area_struct of the process
*
* Called when process does an mmap on habanalabs device. Call the device's mmap
* Called when process does an mmap on habanalabs device. Call the relevant mmap
* function at the end of the common code.
*/
static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
@@ -486,7 +486,6 @@ static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
case HL_MMAP_TYPE_TS_BUFF:
return hl_mem_mgr_mmap(&hpriv->mem_mgr, vma, NULL);
}
return -EINVAL;
}
@@ -686,12 +685,20 @@ static int device_early_init(struct hl_device *hdev)
goto free_cq_wq;
}
hdev->cs_cmplt_wq = alloc_workqueue("hl-cs-completions", WQ_UNBOUND, 0);
if (!hdev->cs_cmplt_wq) {
dev_err(hdev->dev,
"Failed to allocate CS completions workqueue\n");
rc = -ENOMEM;
goto free_eq_wq;
}
hdev->ts_free_obj_wq = alloc_workqueue("hl-ts-free-obj", WQ_UNBOUND, 0);
if (!hdev->ts_free_obj_wq) {
dev_err(hdev->dev,
"Failed to allocate Timestamp registration free workqueue\n");
rc = -ENOMEM;
goto free_eq_wq;
goto free_cs_cmplt_wq;
}
hdev->pf_wq = alloc_workqueue("hl-prefetch", WQ_UNBOUND, 0);
@@ -748,6 +755,8 @@ static int device_early_init(struct hl_device *hdev)
destroy_workqueue(hdev->pf_wq);
free_ts_free_wq:
destroy_workqueue(hdev->ts_free_obj_wq);
free_cs_cmplt_wq:
destroy_workqueue(hdev->cs_cmplt_wq);
free_eq_wq:
destroy_workqueue(hdev->eq_wq);
free_cq_wq:
@@ -788,6 +797,7 @@ static void device_early_fini(struct hl_device *hdev)
destroy_workqueue(hdev->pf_wq);
destroy_workqueue(hdev->ts_free_obj_wq);
destroy_workqueue(hdev->cs_cmplt_wq);
destroy_workqueue(hdev->eq_wq);
destroy_workqueue(hdev->device_reset_work.wq);
@@ -1706,13 +1716,12 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
if (rc)
goto free_dev_ctrl;
user_interrupt_cnt = hdev->asic_prop.user_interrupt_count;
user_interrupt_cnt = hdev->asic_prop.user_dec_intr_count +
hdev->asic_prop.user_interrupt_count;
if (user_interrupt_cnt) {
hdev->user_interrupt = kcalloc(user_interrupt_cnt,
sizeof(*hdev->user_interrupt),
GFP_KERNEL);
hdev->user_interrupt = kcalloc(user_interrupt_cnt, sizeof(*hdev->user_interrupt),
GFP_KERNEL);
if (!hdev->user_interrupt) {
rc = -ENOMEM;
goto early_fini;
@@ -1725,7 +1734,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
*/
rc = hdev->asic_funcs->sw_init(hdev);
if (rc)
goto user_interrupts_fini;
goto free_usr_intr_mem;
/* initialize completion structure for multi CS wait */
@@ -1773,6 +1782,13 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
hdev->completion_queue[i].cq_idx = i;
}
hdev->shadow_cs_queue = kcalloc(hdev->asic_prop.max_pending_cs,
sizeof(*hdev->shadow_cs_queue), GFP_KERNEL);
if (!hdev->shadow_cs_queue) {
rc = -ENOMEM;
goto cq_fini;
}
/*
* Initialize the event queue. Must be done before hw_init,
* because there the address of the event queue is being
@@ -1781,7 +1797,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
rc = hl_eq_init(hdev, &hdev->event_queue);
if (rc) {
dev_err(hdev->dev, "failed to initialize event queue\n");
goto cq_fini;
goto free_shadow_cs_queue;
}
/* MMU S/W must be initialized before kernel context is created */
@@ -1932,6 +1948,8 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
hl_mmu_fini(hdev);
eq_fini:
hl_eq_fini(hdev, &hdev->event_queue);
free_shadow_cs_queue:
kfree(hdev->shadow_cs_queue);
cq_fini:
for (i = 0 ; i < cq_ready_cnt ; i++)
hl_cq_fini(hdev, &hdev->completion_queue[i]);
@@ -1940,7 +1958,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
hl_hw_queues_destroy(hdev);
sw_fini:
hdev->asic_funcs->sw_fini(hdev);
user_interrupts_fini:
free_usr_intr_mem:
kfree(hdev->user_interrupt);
early_fini:
device_early_fini(hdev);
@@ -2080,6 +2098,8 @@ void hl_device_fini(struct hl_device *hdev)
hl_eq_fini(hdev, &hdev->event_queue);
kfree(hdev->shadow_cs_queue);
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
hl_cq_fini(hdev, &hdev->completion_queue[i]);
kfree(hdev->completion_queue);
......
@@ -2893,9 +2893,13 @@ struct hl_reset_info {
* @common_user_interrupt: common user interrupt for all user interrupts.
* upon any user interrupt, driver will monitor the
* list of fences registered to this common structure.
* @shadow_cs_queue: pointer to a shadow queue that holds pointers to
* outstanding command submissions.
* @cq_wq: work queues of completion queues for executing work in process
* context.
* @eq_wq: work queue of event queue for executing work in process context.
* @cs_cmplt_wq: work queue of CS completions for executing work in process
* context.
* @ts_free_obj_wq: work queue for timestamp registration objects release.
* @pf_wq: work queue for MMU pre-fetch operations.
* @kernel_ctx: Kernel driver context structure.
@@ -3053,8 +3057,10 @@ struct hl_device {
struct hl_cq *completion_queue;
struct hl_user_interrupt *user_interrupt;
struct hl_user_interrupt common_user_interrupt;
struct hl_cs **shadow_cs_queue;
struct workqueue_struct **cq_wq;
struct workqueue_struct *eq_wq;
struct workqueue_struct *cs_cmplt_wq;
struct workqueue_struct *ts_free_obj_wq;
struct workqueue_struct *pf_wq;
struct hl_ctx *kernel_ctx;
......
@@ -696,6 +696,16 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
goto unroll_cq_resv;
}
rc = hdev->asic_funcs->pre_schedule_cs(cs);
if (rc) {
dev_err(hdev->dev,
"Failed in pre-submission operations of CS %d.%llu\n",
ctx->asid, cs->sequence);
goto unroll_cq_resv;
}
hdev->shadow_cs_queue[cs->sequence &
(hdev->asic_prop.max_pending_cs - 1)] = cs;
if (cs->encaps_signals && cs->staged_first) {
rc = encaps_sig_first_staged_cs_handler(hdev, cs);
......
@@ -66,6 +66,56 @@ static void irq_handle_eqe(struct work_struct *work)
kfree(eqe_work);
}
/**
* job_finish - queue job finish work
*
* @hdev: pointer to device structure
* @cs_seq: command submission sequence
* @cq: completion queue
*
*/
static void job_finish(struct hl_device *hdev, u32 cs_seq, struct hl_cq *cq)
{
struct hl_hw_queue *queue;
struct hl_cs_job *job;
queue = &hdev->kernel_queues[cq->hw_queue_id];
job = queue->shadow_queue[hl_pi_2_offset(cs_seq)];
queue_work(hdev->cq_wq[cq->cq_idx], &job->finish_work);
atomic_inc(&queue->ci);
}
/**
* cs_finish - queue all cs jobs finish work
*
* @hdev: pointer to device structure
* @cs_seq: command submission sequence
*
*/
static void cs_finish(struct hl_device *hdev, u16 cs_seq)
{
struct asic_fixed_properties *prop = &hdev->asic_prop;
struct hl_hw_queue *queue;
struct hl_cs *cs;
struct hl_cs_job *job;
cs = hdev->shadow_cs_queue[cs_seq & (prop->max_pending_cs - 1)];
if (!cs) {
dev_warn(hdev->dev,
"No pointer to CS in shadow array at index %d\n",
cs_seq);
return;
}
list_for_each_entry(job, &cs->job_list, cs_node) {
queue = &hdev->kernel_queues[job->hw_queue_id];
atomic_inc(&queue->ci);
}
queue_work(hdev->cs_cmplt_wq, &cs->finish_work);
}
/**
* hl_irq_handler_cq - irq handler for completion queue
*
@@ -77,9 +127,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg)
{
struct hl_cq *cq = arg;
struct hl_device *hdev = cq->hdev;
struct hl_hw_queue *queue;
struct hl_cs_job *job;
bool shadow_index_valid;
bool shadow_index_valid, entry_ready;
u16 shadow_index;
struct hl_cq_entry *cq_entry, *cq_base;
@@ -93,37 +141,41 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg)
cq_base = cq->kernel_address;
while (1) {
bool entry_ready = ((le32_to_cpu(cq_base[cq->ci].data) &
CQ_ENTRY_READY_MASK)
>> CQ_ENTRY_READY_SHIFT);
cq_entry = (struct hl_cq_entry *) &cq_base[cq->ci];
entry_ready = !!FIELD_GET(CQ_ENTRY_READY_MASK,
le32_to_cpu(cq_entry->data));
if (!entry_ready)
break;
cq_entry = (struct hl_cq_entry *) &cq_base[cq->ci];
/* Make sure we read CQ entry contents after we've
* checked the ownership bit.
*/
dma_rmb();
shadow_index_valid = ((le32_to_cpu(cq_entry->data) &
CQ_ENTRY_SHADOW_INDEX_VALID_MASK)
>> CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT);
shadow_index = (u16) ((le32_to_cpu(cq_entry->data) &
CQ_ENTRY_SHADOW_INDEX_MASK)
>> CQ_ENTRY_SHADOW_INDEX_SHIFT);
shadow_index_valid =
!!FIELD_GET(CQ_ENTRY_SHADOW_INDEX_VALID_MASK,
le32_to_cpu(cq_entry->data));
queue = &hdev->kernel_queues[cq->hw_queue_id];
shadow_index = FIELD_GET(CQ_ENTRY_SHADOW_INDEX_MASK,
le32_to_cpu(cq_entry->data));
if ((shadow_index_valid) && (!hdev->disabled)) {
job = queue->shadow_queue[hl_pi_2_offset(shadow_index)];
queue_work(hdev->cq_wq[cq->cq_idx], &job->finish_work);
/*
* CQ interrupt handler has 2 modes of operation:
* 1. Interrupt per CS completion: (Single CQ for all queues)
* CQ entry represents a completed CS
*
* 2. Interrupt per CS job completion in queue: (CQ per queue)
* CQ entry represents a completed job in a certain queue
*/
if (shadow_index_valid && !hdev->disabled) {
if (hdev->asic_prop.completion_mode ==
HL_COMPLETION_MODE_CS)
cs_finish(hdev, shadow_index);
else
job_finish(hdev, shadow_index, cq);
}
atomic_inc(&queue->ci);
/* Clear CQ entry ready bit */
cq_entry->data = cpu_to_le32(le32_to_cpu(cq_entry->data) &
~CQ_ENTRY_READY_MASK);
......
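As an aside on the interrupt handler rewrite above, the open-coded mask/shift pairs were replaced with FIELD_GET(). A rough stand-alone illustration of that decoding style follows; the mask values and toy_field_get() are placeholders, not the real CQ_ENTRY_* definitions (in the kernel, FIELD_GET() from <linux/bitfield.h> derives the shift from the mask at compile time).

#include <stdio.h>
#include <stdint.h>

/* placeholder bit layout, not the driver's CQ_ENTRY_* values */
#define TOY_ENTRY_SHADOW_INDEX_MASK		0x0000ffffu
#define TOY_ENTRY_SHADOW_INDEX_VALID_MASK	0x40000000u
#define TOY_ENTRY_READY_MASK			0x80000000u

/* mimic FIELD_GET(): mask the word, then shift by the mask's lowest set bit */
static inline uint32_t toy_field_get(uint32_t mask, uint32_t word)
{
	return (word & mask) >> __builtin_ctz(mask);
}

int main(void)
{
	uint32_t data = TOY_ENTRY_READY_MASK |
			TOY_ENTRY_SHADOW_INDEX_VALID_MASK | 42u;

	printf("ready=%u valid=%u index=%u\n",
	       toy_field_get(TOY_ENTRY_READY_MASK, data),
	       toy_field_get(TOY_ENTRY_SHADOW_INDEX_VALID_MASK, data),
	       toy_field_get(TOY_ENTRY_SHADOW_INDEX_MASK, data));
	return 0;
}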
@@ -44,7 +44,7 @@ static int hl_get_pb_block(struct hl_device *hdev, u32 mm_reg_addr,
*
*/
static int hl_unset_pb_in_block(struct hl_device *hdev, u32 reg_offset,
struct hl_block_glbl_sec *sgs_entry)
struct hl_block_glbl_sec *sgs_entry)
{
if ((reg_offset >= HL_BLOCK_SIZE) || (reg_offset & 0x3)) {
dev_err(hdev->dev,
......