Commit 79b1894c authored by Ofir Bitton, committed by Oded Gabbay

habanalabs: use queue pi/ci in order to determine queue occupancy

Instead of using the number of free slots on the compute CQ to determine
whether we can submit work to a queue, use the queue's pi/ci.

This is needed for future ASICs where we don't have a CQ per queue.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
parent 3abc99bb
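
For context on the change described above: the driver keeps each queue's pi and ci running over twice the queue length, so an empty queue (pi == ci) is distinguishable from a full one (pi - ci == length, modulo twice the length), and the free-slot count follows from plain modular arithmetic with no per-queue CQ counter involved. Below is a minimal userspace sketch of that arithmetic, loosely mirroring the queue_ci_get()/queue_free_slots() helpers added by this patch; the sketch_* names and SKETCH_QUEUE_LENGTH are illustrative assumptions, not driver code.

    #include <stdatomic.h>
    #include <stdio.h>

    #define SKETCH_QUEUE_LENGTH 4u                          /* illustrative size */
    #define SKETCH_PTR_MASK ((SKETCH_QUEUE_LENGTH << 1) - 1)

    struct sketch_queue {
            unsigned int pi;    /* producer index, wraps over 2 * length */
            atomic_uint ci;     /* consumer index, advanced without masking */
    };

    /* Mask the raw consumer counter only when it is read. */
    static unsigned int sketch_ci_get(struct sketch_queue *q)
    {
            return atomic_load(&q->ci) & SKETCH_PTR_MASK;
    }

    /* Free slots = length - occupancy, where occupancy = (pi - ci) mod 2*length. */
    static unsigned int sketch_free_slots(struct sketch_queue *q)
    {
            unsigned int used = (q->pi - sketch_ci_get(q)) & SKETCH_PTR_MASK;

            return SKETCH_QUEUE_LENGTH - used;
    }

    int main(void)
    {
            struct sketch_queue q;

            q.pi = 1;                   /* producer has wrapped around */
            atomic_init(&q.ci, 7);

            /* occupancy = (1 - 7) mod 8 = 2, so 2 of the 4 slots are free */
            printf("free slots: %u\n", sketch_free_slots(&q));
            return 0;
    }

With a length of 4 the indices run over 0..7, so pi = 1 with ci = 7 means the producer has wrapped around while two entries are still outstanding; masking ci only at read time is what lets the patch replace the locked, masked increment with a bare atomic_add()/atomic_inc().
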
...
@@ -1144,14 +1144,17 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
          * because there the addresses of the completion queues are being
          * passed as arguments to request_irq
          */
-        hdev->completion_queue = kcalloc(cq_cnt,
-                        sizeof(*hdev->completion_queue),
-                        GFP_KERNEL);
+        if (cq_cnt) {
+                hdev->completion_queue = kcalloc(cq_cnt,
+                                sizeof(*hdev->completion_queue),
+                                GFP_KERNEL);
 
-        if (!hdev->completion_queue) {
-                dev_err(hdev->dev, "failed to allocate completion queues\n");
-                rc = -ENOMEM;
-                goto hw_queues_destroy;
+                if (!hdev->completion_queue) {
+                        dev_err(hdev->dev,
+                                "failed to allocate completion queues\n");
+                        rc = -ENOMEM;
+                        goto hw_queues_destroy;
+                }
         }
 
         for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) {
...
@@ -461,7 +461,7 @@ struct hl_hw_queue {
         u64                     kernel_address;
         dma_addr_t              bus_address;
         u32                     pi;
-        u32                     ci;
+        atomic_t                ci;
         u32                     hw_queue_id;
         u32                     cq_id;
         u32                     msi_vec;
...
@@ -23,10 +23,14 @@ inline u32 hl_hw_queue_add_ptr(u32 ptr, u16 val)
         ptr &= ((HL_QUEUE_LENGTH << 1) - 1);
         return ptr;
 }
 
+static inline int queue_ci_get(atomic_t *ci, u32 queue_len)
+{
+        return atomic_read(ci) & ((queue_len << 1) - 1);
+}
+
 static inline int queue_free_slots(struct hl_hw_queue *q, u32 queue_len)
 {
-        int delta = (q->pi - q->ci);
+        int delta = (q->pi - queue_ci_get(&q->ci, queue_len));
 
         if (delta >= 0)
                 return (queue_len - delta);
...
@@ -40,21 +44,14 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs)
         struct hl_hw_queue *q;
         int i;
 
-        hdev->asic_funcs->hw_queues_lock(hdev);
-
         if (hdev->disabled)
-                goto out;
+                return;
 
         q = &hdev->kernel_queues[0];
         for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) {
-                if (q->queue_type == QUEUE_TYPE_INT) {
-                        q->ci += cs->jobs_in_queue_cnt[i];
-                        q->ci &= ((q->int_queue_len << 1) - 1);
-                }
+                if (q->queue_type == QUEUE_TYPE_INT)
+                        atomic_add(cs->jobs_in_queue_cnt[i], &q->ci);
         }
-
-out:
-        hdev->asic_funcs->hw_queues_unlock(hdev);
 }
 
 /*
...
@@ -174,38 +171,26 @@ static int int_queue_sanity_checks(struct hl_device *hdev,
 }
 
 /*
- * hw_queue_sanity_checks() - Perform some sanity checks on a H/W queue.
+ * hw_queue_sanity_checks() - Make sure we have enough space in the h/w queue
  * @hdev: Pointer to hl_device structure.
  * @q: Pointer to hl_hw_queue structure.
  * @num_of_entries: How many entries to check for space.
  *
- * Perform the following:
- * - Make sure we have enough space in the completion queue.
- *   This check also ensures that there is enough space in the h/w queue, as
- *   both queues are of the same size.
- * - Reserve space in the completion queue (needs to be reversed if there
- *   is a failure down the road before the actual submission of work).
+ * Notice: We do not reserve queue entries so this function mustn't be called
+ * more than once per CS for the same queue
  *
- * Both operations are done using the "free_slots_cnt" field of the completion
- * queue. The CI counters of the queue and the completion queue are not
- * needed/used for the H/W queue type.
  */
 static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
                                         int num_of_entries)
 {
-        atomic_t *free_slots =
-                        &hdev->completion_queue[q->cq_id].free_slots_cnt;
+        int free_slots_cnt;
 
-        /*
-         * Check we have enough space in the completion queue.
-         * Add -1 to counter (decrement) unless counter was already 0.
-         * In that case, CQ is full so we can't submit a new CB.
-         * atomic_add_unless will return 0 if counter was already 0.
-         */
-        if (atomic_add_negative(num_of_entries * -1, free_slots)) {
-                dev_dbg(hdev->dev, "No space for %d entries on CQ %d\n",
-                        num_of_entries, q->hw_queue_id);
-                atomic_add(num_of_entries, free_slots);
+        /* Check we have enough space in the queue */
+        free_slots_cnt = queue_free_slots(q, HL_QUEUE_LENGTH);
+
+        if (free_slots_cnt < num_of_entries) {
+                dev_dbg(hdev->dev, "Queue %d doesn't have room for %d CBs\n",
+                        q->hw_queue_id, num_of_entries);
                 return -EAGAIN;
         }
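
The "Notice" in the rewritten comment is the behavioral subtlety of this hunk: because the check no longer reserves (decrements) anything, two independent checks against the same queue for the same CS would each observe the same free-slot count and could jointly oversubscribe it, so the caller has to aggregate the CS's entries per queue and check once. A hypothetical illustration, reusing the sketch_queue helpers from the sketch above (again not driver code):

    /* Illustrative only: why the space check must run once per CS per queue. */
    static void sketch_double_check_pitfall(void)
    {
            struct sketch_queue q;
            unsigned int first_batch = 3, second_batch = 3;

            q.pi = 0;
            atomic_init(&q.ci, 0);          /* empty queue: all 4 slots free */

            /* Two separate checks both pass ... */
            int per_batch_ok = sketch_free_slots(&q) >= first_batch &&
                               sketch_free_slots(&q) >= second_batch;

            /* ... but the aggregated check correctly fails, since 3 + 3 > 4. */
            int aggregated_ok =
                    sketch_free_slots(&q) >= first_batch + second_batch;

            printf("per-batch: %d, aggregated: %d\n", per_batch_ok, aggregated_ok);
    }
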
...
@@ -366,7 +351,6 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
 {
         struct hl_device *hdev = job->cs->ctx->hdev;
         struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
-        struct hl_cq *cq;
         u64 ptr;
         u32 offset, ctl, len;
...
@@ -395,17 +379,6 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
         else
                 ptr = (u64) (uintptr_t) job->user_cb;
 
-        /*
-         * No need to protect pi_offset because scheduling to the
-         * H/W queues is done under the scheduler mutex
-         *
-         * No need to check if CQ is full because it was already
-         * checked in hw_queue_sanity_checks
-         */
-        cq = &hdev->completion_queue[q->cq_id];
-        cq->pi = hl_cq_inc_ptr(cq->pi);
-
         ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
 }
...
@@ -552,8 +525,7 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
                                 goto unroll_cq_resv;
                         }
 
-                        if (q->queue_type == QUEUE_TYPE_EXT ||
-                                        q->queue_type == QUEUE_TYPE_HW)
+                        if (q->queue_type == QUEUE_TYPE_EXT)
                                 cq_cnt++;
                 }
         }
...
@@ -605,9 +577,8 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 unroll_cq_resv:
         q = &hdev->kernel_queues[0];
         for (i = 0 ; (i < max_queues) && (cq_cnt > 0) ; i++, q++) {
-                if ((q->queue_type == QUEUE_TYPE_EXT ||
-                                q->queue_type == QUEUE_TYPE_HW) &&
-                                cs->jobs_in_queue_cnt[i]) {
+                if ((q->queue_type == QUEUE_TYPE_EXT) &&
+                                        (cs->jobs_in_queue_cnt[i])) {
                         atomic_t *free_slots =
                                         &hdev->completion_queue[i].free_slots_cnt;
                         atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
...
@@ -631,7 +602,7 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
 {
         struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
 
-        q->ci = hl_queue_inc_ptr(q->ci);
+        atomic_inc(&q->ci);
 }
 
 static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
...
@@ -666,7 +637,7 @@ static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
         }
 
         /* Make sure read/write pointers are initialized to start of queue */
-        q->ci = 0;
+        atomic_set(&q->ci, 0);
         q->pi = 0;
 
         return 0;
...
@@ -700,7 +671,7 @@ static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
         q->kernel_address = (u64) (uintptr_t) p;
 
         q->pi = 0;
-        q->ci = 0;
+        atomic_set(&q->ci, 0);
 
         return 0;
 }
...
@@ -729,7 +700,7 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
         q->kernel_address = (u64) (uintptr_t) p;
 
         /* Make sure read/write pointers are initialized to start of queue */
-        q->ci = 0;
+        atomic_set(&q->ci, 0);
         q->pi = 0;
 
         return 0;
...
@@ -931,7 +902,8 @@ void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset)
                 if ((!q->valid) ||
                         ((!hard_reset) && (q->queue_type == QUEUE_TYPE_CPU)))
                         continue;
-                q->pi = q->ci = 0;
+                q->pi = 0;
+                atomic_set(&q->ci, 0);
 
                 if (q->supports_sync_stream)
                         sync_stream_queue_reset(hdev, q->hw_queue_id);
...
@@ -122,12 +122,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg)
                 queue_work(hdev->cq_wq, &job->finish_work);
         }
 
-        /* Update ci of the context's queue. There is no
-         * need to protect it with spinlock because this update is
-         * done only inside IRQ and there is a different IRQ per
-         * queue
-         */
-        queue->ci = hl_queue_inc_ptr(queue->ci);
+        atomic_inc(&queue->ci);
 
         /* Clear CQ entry ready bit */
         cq_entry->data = cpu_to_le32(le32_to_cpu(cq_entry->data) &
...