Commit a98d73c7 authored by Ofir Bitton's avatar Ofir Bitton Committed by Oded Gabbay

habanalabs: Replace dma-fence mechanism with completions

habanalabs driver uses dma-fence mechanism for synchronization.
dma-fence mechanism was designed solely for GPUs, hence we propose
a simpler mechanism based on completions to replace current
dma-fence objects.
Signed-off-by: default avatarOfir Bitton <obitton@habana.ai>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: default avatarDaniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: default avatarOded Gabbay <oded.gabbay@gmail.com>
parent b71590ef
...@@ -38,26 +38,10 @@ void hl_sob_reset_error(struct kref *ref) ...@@ -38,26 +38,10 @@ void hl_sob_reset_error(struct kref *ref)
hw_sob->q_idx, hw_sob->sob_id); hw_sob->q_idx, hw_sob->sob_id);
} }
static const char *hl_fence_get_driver_name(struct dma_fence *fence) static void hl_fence_release(struct kref *kref)
{
return "HabanaLabs";
}
static const char *hl_fence_get_timeline_name(struct dma_fence *fence)
{
struct hl_cs_compl *hl_cs_compl =
container_of(fence, struct hl_cs_compl, base_fence);
return dev_name(hl_cs_compl->hdev->dev);
}
static bool hl_fence_enable_signaling(struct dma_fence *fence)
{
return true;
}
static void hl_fence_release(struct dma_fence *fence)
{ {
struct hl_fence *fence =
container_of(kref, struct hl_fence, refcount);
struct hl_cs_compl *hl_cs_cmpl = struct hl_cs_compl *hl_cs_cmpl =
container_of(fence, struct hl_cs_compl, base_fence); container_of(fence, struct hl_cs_compl, base_fence);
struct hl_device *hdev = hl_cs_cmpl->hdev; struct hl_device *hdev = hl_cs_cmpl->hdev;
...@@ -99,15 +83,27 @@ static void hl_fence_release(struct dma_fence *fence) ...@@ -99,15 +83,27 @@ static void hl_fence_release(struct dma_fence *fence)
} }
free: free:
kfree_rcu(hl_cs_cmpl, base_fence.rcu); kfree(hl_cs_cmpl);
} }
static const struct dma_fence_ops hl_fence_ops = { void hl_fence_put(struct hl_fence *fence)
.get_driver_name = hl_fence_get_driver_name, {
.get_timeline_name = hl_fence_get_timeline_name, if (fence)
.enable_signaling = hl_fence_enable_signaling, kref_put(&fence->refcount, hl_fence_release);
.release = hl_fence_release }
};
void hl_fence_get(struct hl_fence *fence)
{
if (fence)
kref_get(&fence->refcount);
}
static void hl_fence_init(struct hl_fence *fence)
{
kref_init(&fence->refcount);
fence->error = 0;
init_completion(&fence->completion);
}
static void cs_get(struct hl_cs *cs) static void cs_get(struct hl_cs *cs)
{ {
...@@ -336,7 +332,7 @@ static void cs_do_release(struct kref *ref) ...@@ -336,7 +332,7 @@ static void cs_do_release(struct kref *ref)
* In case the wait for signal CS was submitted, the put occurs * In case the wait for signal CS was submitted, the put occurs
* in init_signal_wait_cs() right before hanging on the PQ. * in init_signal_wait_cs() right before hanging on the PQ.
*/ */
dma_fence_put(cs->signal_fence); hl_fence_put(cs->signal_fence);
} }
/* /*
...@@ -348,19 +344,18 @@ static void cs_do_release(struct kref *ref) ...@@ -348,19 +344,18 @@ static void cs_do_release(struct kref *ref)
hl_ctx_put(cs->ctx); hl_ctx_put(cs->ctx);
/* We need to mark an error for not submitted because in that case /* We need to mark an error for not submitted because in that case
* the dma fence release flow is different. Mainly, we don't need * the hl fence release flow is different. Mainly, we don't need
* to handle hw_sob for signal/wait * to handle hw_sob for signal/wait
*/ */
if (cs->timedout) if (cs->timedout)
dma_fence_set_error(cs->fence, -ETIMEDOUT); cs->fence->error = -ETIMEDOUT;
else if (cs->aborted) else if (cs->aborted)
dma_fence_set_error(cs->fence, -EIO); cs->fence->error = -EIO;
else if (!cs->submitted) else if (!cs->submitted)
dma_fence_set_error(cs->fence, -EBUSY); cs->fence->error = -EBUSY;
dma_fence_signal(cs->fence);
dma_fence_put(cs->fence);
complete_all(&cs->fence->completion);
hl_fence_put(cs->fence);
cs_counters_aggregate(hdev, cs->ctx); cs_counters_aggregate(hdev, cs->ctx);
kfree(cs->jobs_in_queue_cnt); kfree(cs->jobs_in_queue_cnt);
...@@ -401,7 +396,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, ...@@ -401,7 +396,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
enum hl_cs_type cs_type, struct hl_cs **cs_new) enum hl_cs_type cs_type, struct hl_cs **cs_new)
{ {
struct hl_cs_compl *cs_cmpl; struct hl_cs_compl *cs_cmpl;
struct dma_fence *other = NULL; struct hl_fence *other = NULL;
struct hl_cs *cs; struct hl_cs *cs;
int rc; int rc;
...@@ -434,7 +429,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, ...@@ -434,7 +429,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
cs_cmpl->cs_seq = ctx->cs_sequence; cs_cmpl->cs_seq = ctx->cs_sequence;
other = ctx->cs_pending[cs_cmpl->cs_seq & other = ctx->cs_pending[cs_cmpl->cs_seq &
(hdev->asic_prop.max_pending_cs - 1)]; (hdev->asic_prop.max_pending_cs - 1)];
if ((other) && (!dma_fence_is_signaled(other))) {
if (other && !completion_done(&other->completion)) {
dev_dbg(hdev->dev, dev_dbg(hdev->dev,
"Rejecting CS because of too many in-flights CS\n"); "Rejecting CS because of too many in-flights CS\n");
rc = -EAGAIN; rc = -EAGAIN;
...@@ -448,8 +444,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, ...@@ -448,8 +444,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
goto free_fence; goto free_fence;
} }
dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock, /* init hl_fence */
ctx->asid, ctx->cs_sequence); hl_fence_init(&cs_cmpl->base_fence);
cs->sequence = cs_cmpl->cs_seq; cs->sequence = cs_cmpl->cs_seq;
...@@ -458,9 +454,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, ...@@ -458,9 +454,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
&cs_cmpl->base_fence; &cs_cmpl->base_fence;
ctx->cs_sequence++; ctx->cs_sequence++;
dma_fence_get(&cs_cmpl->base_fence); hl_fence_get(&cs_cmpl->base_fence);
dma_fence_put(other); hl_fence_put(other);
spin_unlock(&ctx->cs_lock); spin_unlock(&ctx->cs_lock);
...@@ -773,7 +769,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, ...@@ -773,7 +769,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
struct hl_ctx *ctx = hpriv->ctx; struct hl_ctx *ctx = hpriv->ctx;
struct hl_cs_chunk *cs_chunk_array, *chunk; struct hl_cs_chunk *cs_chunk_array, *chunk;
struct hw_queue_properties *hw_queue_prop; struct hw_queue_properties *hw_queue_prop;
struct dma_fence *sig_fence = NULL; struct hl_fence *sig_fence = NULL;
struct hl_cs_job *job; struct hl_cs_job *job;
struct hl_cs *cs; struct hl_cs *cs;
struct hl_cb *cb; struct hl_cb *cb;
...@@ -883,14 +879,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, ...@@ -883,14 +879,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
dev_err(hdev->dev, dev_err(hdev->dev,
"CS seq 0x%llx is not of a signal CS\n", "CS seq 0x%llx is not of a signal CS\n",
signal_seq); signal_seq);
dma_fence_put(sig_fence); hl_fence_put(sig_fence);
rc = -EINVAL; rc = -EINVAL;
goto free_signal_seq_array; goto free_signal_seq_array;
} }
if (dma_fence_is_signaled(sig_fence)) { if (completion_done(&sig_fence->completion)) {
/* signal CS already finished */ /* signal CS already finished */
dma_fence_put(sig_fence); hl_fence_put(sig_fence);
rc = 0; rc = 0;
goto free_signal_seq_array; goto free_signal_seq_array;
} }
...@@ -902,7 +898,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, ...@@ -902,7 +898,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
rc = allocate_cs(hdev, ctx, cs_type, &cs); rc = allocate_cs(hdev, ctx, cs_type, &cs);
if (rc) { if (rc) {
if (cs_type == CS_TYPE_WAIT) if (cs_type == CS_TYPE_WAIT)
dma_fence_put(sig_fence); hl_fence_put(sig_fence);
hl_ctx_put(ctx); hl_ctx_put(ctx);
goto free_signal_seq_array; goto free_signal_seq_array;
} }
...@@ -1162,7 +1158,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) ...@@ -1162,7 +1158,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
static long _hl_cs_wait_ioctl(struct hl_device *hdev, static long _hl_cs_wait_ioctl(struct hl_device *hdev,
struct hl_ctx *ctx, u64 timeout_us, u64 seq) struct hl_ctx *ctx, u64 timeout_us, u64 seq)
{ {
struct dma_fence *fence; struct hl_fence *fence;
unsigned long timeout; unsigned long timeout;
long rc; long rc;
...@@ -1181,12 +1177,15 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev, ...@@ -1181,12 +1177,15 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
"Can't wait on CS %llu because current CS is at seq %llu\n", "Can't wait on CS %llu because current CS is at seq %llu\n",
seq, ctx->cs_sequence); seq, ctx->cs_sequence);
} else if (fence) { } else if (fence) {
rc = dma_fence_wait_timeout(fence, true, timeout); rc = wait_for_completion_interruptible_timeout(
&fence->completion, timeout);
if (fence->error == -ETIMEDOUT) if (fence->error == -ETIMEDOUT)
rc = -ETIMEDOUT; rc = -ETIMEDOUT;
else if (fence->error == -EIO) else if (fence->error == -EIO)
rc = -EIO; rc = -EIO;
dma_fence_put(fence);
hl_fence_put(fence);
} else { } else {
dev_dbg(hdev->dev, dev_dbg(hdev->dev,
"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n", "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
......
...@@ -23,7 +23,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx) ...@@ -23,7 +23,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
*/ */
for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++) for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
dma_fence_put(ctx->cs_pending[i]); hl_fence_put(ctx->cs_pending[i]);
kfree(ctx->cs_pending); kfree(ctx->cs_pending);
...@@ -128,7 +128,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) ...@@ -128,7 +128,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
atomic_set(&ctx->thread_ctx_switch_token, 1); atomic_set(&ctx->thread_ctx_switch_token, 1);
ctx->thread_ctx_switch_wait_token = 0; ctx->thread_ctx_switch_wait_token = 0;
ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs, ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
sizeof(struct dma_fence *), sizeof(struct hl_fence *),
GFP_KERNEL); GFP_KERNEL);
if (!ctx->cs_pending) if (!ctx->cs_pending)
return -ENOMEM; return -ENOMEM;
...@@ -184,10 +184,10 @@ int hl_ctx_put(struct hl_ctx *ctx) ...@@ -184,10 +184,10 @@ int hl_ctx_put(struct hl_ctx *ctx)
return kref_put(&ctx->refcount, hl_ctx_do_release); return kref_put(&ctx->refcount, hl_ctx_do_release);
} }
struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
{ {
struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop; struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
struct dma_fence *fence; struct hl_fence *fence;
spin_lock(&ctx->cs_lock); spin_lock(&ctx->cs_lock);
...@@ -201,8 +201,9 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) ...@@ -201,8 +201,9 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
return NULL; return NULL;
} }
fence = dma_fence_get( fence = ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)];
ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]); hl_fence_get(fence);
spin_unlock(&ctx->cs_lock); spin_unlock(&ctx->cs_lock);
return fence; return fence;
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
#include <linux/cdev.h> #include <linux/cdev.h>
#include <linux/iopoll.h> #include <linux/iopoll.h>
#include <linux/irqreturn.h> #include <linux/irqreturn.h>
#include <linux/dma-fence.h>
#include <linux/dma-direction.h> #include <linux/dma-direction.h>
#include <linux/scatterlist.h> #include <linux/scatterlist.h>
#include <linux/hashtable.h> #include <linux/hashtable.h>
...@@ -342,9 +341,22 @@ struct asic_fixed_properties { ...@@ -342,9 +341,22 @@ struct asic_fixed_properties {
u8 completion_queues_count; u8 completion_queues_count;
}; };
/**
* struct hl_fence - software synchronization primitive
* @completion: fence is implemented using completion
* @refcount: refcount for this fence
* @error: mark this fence with error
*
*/
struct hl_fence {
struct completion completion;
struct kref refcount;
int error;
};
/** /**
* struct hl_cs_compl - command submission completion object. * struct hl_cs_compl - command submission completion object.
* @base_fence: kernel fence object. * @base_fence: hl fence object.
* @lock: spinlock to protect fence. * @lock: spinlock to protect fence.
* @hdev: habanalabs device structure. * @hdev: habanalabs device structure.
* @hw_sob: the H/W SOB used in this signal/wait CS. * @hw_sob: the H/W SOB used in this signal/wait CS.
...@@ -353,7 +365,7 @@ struct asic_fixed_properties { ...@@ -353,7 +365,7 @@ struct asic_fixed_properties {
* @sob_val: the SOB value that is used in this signal/wait CS. * @sob_val: the SOB value that is used in this signal/wait CS.
*/ */
struct hl_cs_compl { struct hl_cs_compl {
struct dma_fence base_fence; struct hl_fence base_fence;
spinlock_t lock; spinlock_t lock;
struct hl_device *hdev; struct hl_device *hdev;
struct hl_hw_sob *hw_sob; struct hl_hw_sob *hw_sob;
...@@ -800,7 +812,7 @@ struct hl_va_range { ...@@ -800,7 +812,7 @@ struct hl_va_range {
* @hdev: pointer to the device structure. * @hdev: pointer to the device structure.
* @refcount: reference counter for the context. Context is released only when * @refcount: reference counter for the context. Context is released only when
* this hits 0l. It is incremented on CS and CS_WAIT. * this hits 0l. It is incremented on CS and CS_WAIT.
* @cs_pending: array of DMA fence objects representing pending CS. * @cs_pending: array of hl fence objects representing pending CS.
* @host_va_range: holds available virtual addresses for host mappings. * @host_va_range: holds available virtual addresses for host mappings.
* @host_huge_va_range: holds available virtual addresses for host mappings * @host_huge_va_range: holds available virtual addresses for host mappings
* with huge pages. * with huge pages.
...@@ -832,7 +844,7 @@ struct hl_ctx { ...@@ -832,7 +844,7 @@ struct hl_ctx {
struct hl_fpriv *hpriv; struct hl_fpriv *hpriv;
struct hl_device *hdev; struct hl_device *hdev;
struct kref refcount; struct kref refcount;
struct dma_fence **cs_pending; struct hl_fence **cs_pending;
struct hl_va_range *host_va_range; struct hl_va_range *host_va_range;
struct hl_va_range *host_huge_va_range; struct hl_va_range *host_huge_va_range;
struct hl_va_range *dram_va_range; struct hl_va_range *dram_va_range;
...@@ -919,8 +931,8 @@ struct hl_cs { ...@@ -919,8 +931,8 @@ struct hl_cs {
struct list_head job_list; struct list_head job_list;
spinlock_t job_lock; spinlock_t job_lock;
struct kref refcount; struct kref refcount;
struct dma_fence *fence; struct hl_fence *fence;
struct dma_fence *signal_fence; struct hl_fence *signal_fence;
struct work_struct finish_work; struct work_struct finish_work;
struct delayed_work work_tdr; struct delayed_work work_tdr;
struct list_head mirror_node; struct list_head mirror_node;
...@@ -1739,7 +1751,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx); ...@@ -1739,7 +1751,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx);
void hl_ctx_do_release(struct kref *ref); void hl_ctx_do_release(struct kref *ref);
void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx); void hl_ctx_get(struct hl_device *hdev, struct hl_ctx *ctx);
int hl_ctx_put(struct hl_ctx *ctx); int hl_ctx_put(struct hl_ctx *ctx);
struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq); struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr); void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr);
void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr); void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr);
...@@ -1781,6 +1793,8 @@ void hl_cs_rollback_all(struct hl_device *hdev); ...@@ -1781,6 +1793,8 @@ void hl_cs_rollback_all(struct hl_device *hdev);
struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
enum hl_queue_type queue_type, bool is_kernel_allocated_cb); enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
void hl_sob_reset_error(struct kref *ref); void hl_sob_reset_error(struct kref *ref);
void hl_fence_put(struct hl_fence *fence);
void hl_fence_get(struct hl_fence *fence);
void goya_set_asic_funcs(struct hl_device *hdev); void goya_set_asic_funcs(struct hl_device *hdev);
void gaudi_set_asic_funcs(struct hl_device *hdev); void gaudi_set_asic_funcs(struct hl_device *hdev);
......
...@@ -474,7 +474,7 @@ static void init_signal_wait_cs(struct hl_cs *cs) ...@@ -474,7 +474,7 @@ static void init_signal_wait_cs(struct hl_cs *cs)
* wait CS was submitted. * wait CS was submitted.
*/ */
mb(); mb();
dma_fence_put(cs->signal_fence); hl_fence_put(cs->signal_fence);
cs->signal_fence = NULL; cs->signal_fence = NULL;
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment