Commit 2e4c6c1a authored by Chris Wilson, committed by Joonas Lahtinen

drm/i915: Remove i915_request.lock requirement for execution callbacks

To implement preempt-to-busy (and so efficient timeslicing and the best
utilization of the hardware submission ports) we let the GPU run
asynchronously with respect to the ELSP submission queue. This created
challenges in maintaining and accessing the driver state that mirrors the
asynchronous GPU execution.

The previous fix 1d9221e9 ("drm/i915: Skip signaling a signaled request"),
however, did not correctly serialise request retirement with the execution
callbacks.

We were using the i915_request.lock to serialise adding an execution callback
with __i915_request_submit. However, if we use an atomic llist_add to
serialise multiple waiters and then check whether the request is already
executing, we can remove the irq-spinlock and fix the serialisation between
retirement and the execution callbacks in one go.
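
As a rough illustration of that add-then-check handshake, the following is a
minimal userspace model, not the i915 code itself: the names request, cb_add,
cb_flush, await_execution and submit are invented for the sketch, with C11
atomics standing in for llist_add/llist_del_all and for the ACTIVE fence flag.
The waiter publishes its callback before testing the flag and the submitter
sets the flag before draining the list, so whichever side runs second sees the
other's update, and the atomic exchange in the drain ensures each callback
runs at most once.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct cb {
	struct cb *next;
	void (*func)(struct cb *cb);
};

struct request {
	_Atomic(struct cb *) execute_cb; /* lock-free LIFO, like struct llist_head */
	atomic_bool active;              /* stands in for I915_FENCE_FLAG_ACTIVE */
};

/* Push a callback onto the request's lock-free list (Treiber-stack push). */
static void cb_add(struct request *rq, struct cb *cb)
{
	cb->next = atomic_load(&rq->execute_cb);
	while (!atomic_compare_exchange_weak(&rq->execute_cb, &cb->next, cb))
		;
}

/* Atomically take the whole list; whoever wins the exchange runs the callbacks. */
static void cb_flush(struct request *rq)
{
	struct cb *cb = atomic_exchange(&rq->execute_cb, (struct cb *)NULL);

	while (cb) {
		struct cb *next = cb->next;

		cb->func(cb);
		cb = next;
	}
}

/* Waiter: register first, then check ACTIVE (cf. __await_execution()). */
static void await_execution(struct request *rq, struct cb *cb)
{
	cb_add(rq, cb);
	if (atomic_load(&rq->active))
		cb_flush(rq); /* raced with submission: run it ourselves */
}

/* Submission: set ACTIVE first, then drain (cf. __i915_request_submit()). */
static void submit(struct request *rq)
{
	atomic_store(&rq->active, true);
	cb_flush(rq);
}

static void hello(struct cb *cb)
{
	(void)cb;
	puts("execute callback ran");
}

int main(void)
{
	struct request rq = { .execute_cb = NULL, .active = false };
	struct cb cb = { .func = hello };

	await_execution(&rq, &cb); /* registered; request not yet active */
	submit(&rq);               /* runs the callback exactly once */
	return 0;
}

The exchange-then-run drain in cb_flush() plays the role that llist_del_all()
plays in the patch below: only one side takes a non-empty list, so a callback
cannot be run twice.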

v2: Avoid using irq_work when outside of the irq-spinlocks, where we can
execute the callbacks immediately.
v3: Pay close attention to the order of setting ACTIVE on retirement; we need
to ensure the request is signaled and the breadcrumbs detached before we
finish removing the request from the engine (sketched below).
v4: Expanded commit message.
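
The v3 point is the same handshake seen from the retirement side: the ACTIVE
bit has to be visible before the callback list is flushed, so a waiter that
registers too late to be included in that flush sees the bit and runs its
callback itself. A rough sketch, reusing the invented names from the model
above (engine_lock and flush_execute_cbs are stand-ins, not the driver's API):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct request {
	atomic_bool active;          /* stands in for I915_FENCE_FLAG_ACTIVE */
	pthread_mutex_t engine_lock; /* stands in for engine->active.lock */
};

/* Placeholder for draining the request's execute_cb list (see model above). */
static void flush_execute_cbs(struct request *rq)
{
	(void)rq;
	puts("flushing execute callbacks");
}

static void remove_from_engine(struct request *rq)
{
	pthread_mutex_lock(&rq->engine_lock);
	/* ...unlink the request from the engine's lists... */
	atomic_store(&rq->active, true); /* late registrants now self-execute */
	pthread_mutex_unlock(&rq->engine_lock);
	flush_execute_cbs(rq);           /* then drain what was queued earlier */
}

int main(void)
{
	struct request rq = {
		.active = false,
		.engine_lock = PTHREAD_MUTEX_INITIALIZER,
	};

	remove_from_engine(&rq);
	return 0;
}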

Fixes: 1d9221e9 ("drm/i915: Skip signaling a signaled request")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200716142207.13003-2-chris@chris-wilson.co.uk
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
[Joonas: Rebased and reordered into drm-intel-gt-next branch]
[Joonas: Added expanded commit message from Tvrtko and Chris]
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
parent b4d9145b
@@ -187,30 +187,34 @@ static void irq_execute_cb_hook(struct irq_work *wrk)
 	irq_execute_cb(wrk);
 }
 
-static void __notify_execute_cb(struct i915_request *rq)
+static __always_inline void
+__notify_execute_cb(struct i915_request *rq, bool (*fn)(struct irq_work *wrk))
 {
 	struct execute_cb *cb, *cn;
 
-	lockdep_assert_held(&rq->lock);
-
-	GEM_BUG_ON(!i915_request_is_active(rq));
 	if (llist_empty(&rq->execute_cb))
 		return;
 
-	llist_for_each_entry_safe(cb, cn, rq->execute_cb.first, work.llnode)
-		irq_work_queue(&cb->work);
+	llist_for_each_entry_safe(cb, cn,
+				  llist_del_all(&rq->execute_cb),
+				  work.llnode)
+		fn(&cb->work);
+}
 
-	/*
-	 * XXX Rollback on __i915_request_unsubmit()
-	 *
-	 * In the future, perhaps when we have an active time-slicing scheduler,
-	 * it will be interesting to unsubmit parallel execution and remove
-	 * busywaits from the GPU until their master is restarted. This is
-	 * quite hairy, we have to carefully rollback the fence and do a
-	 * preempt-to-idle cycle on the target engine, all the while the
-	 * master execute_cb may refire.
-	 */
-	init_llist_head(&rq->execute_cb);
+static void __notify_execute_cb_irq(struct i915_request *rq)
+{
+	__notify_execute_cb(rq, irq_work_queue);
+}
+
+static bool irq_work_imm(struct irq_work *wrk)
+{
+	wrk->func(wrk);
+	return false;
+}
+
+static void __notify_execute_cb_imm(struct i915_request *rq)
+{
+	__notify_execute_cb(rq, irq_work_imm);
 }
 
 static void free_capture_list(struct i915_request *request)
@@ -257,9 +261,16 @@ static void remove_from_engine(struct i915_request *rq)
 		locked = engine;
 	}
 	list_del_init(&rq->sched.link);
+
 	clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
 	clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags);
+
+	/* Prevent further __await_execution() registering a cb, then flush */
+	set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags);
+
 	spin_unlock_irq(&locked->active.lock);
+
+	__notify_execute_cb_imm(rq);
 }
 
 bool i915_request_retire(struct i915_request *rq)
@@ -271,6 +282,7 @@ bool i915_request_retire(struct i915_request *rq)
 	GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
 	trace_i915_request_retire(rq);
+	i915_request_mark_complete(rq);
 
 	/*
 	 * We know the GPU must have read the request to have
@@ -288,15 +300,6 @@ bool i915_request_retire(struct i915_request *rq)
 		__i915_request_fill(rq, POISON_FREE);
 	rq->ring->head = rq->postfix;
 
-	/*
-	 * We only loosely track inflight requests across preemption,
-	 * and so we may find ourselves attempting to retire a _completed_
-	 * request that we have removed from the HW and put back on a run
-	 * queue.
-	 */
-	remove_from_engine(rq);
-
-	i915_request_mark_complete(rq);
 	if (!i915_request_signaled(rq)) {
 		spin_lock_irq(&rq->lock);
 		dma_fence_signal_locked(&rq->fence);
@@ -320,7 +323,6 @@ bool i915_request_retire(struct i915_request *rq)
 	 */
 	remove_from_engine(rq);
 	GEM_BUG_ON(!llist_empty(&rq->execute_cb));
-	spin_unlock_irq(&rq->lock);
 
 	__list_del_entry(&rq->link); /* poison neither prev/next (RCU walks) */
@@ -348,12 +350,6 @@ void i915_request_retire_upto(struct i915_request *rq)
 	} while (i915_request_retire(tmp) && tmp != rq);
 }
 
-static void __llist_add(struct llist_node *node, struct llist_head *head)
-{
-	node->next = head->first;
-	head->first = node;
-}
-
 static struct i915_request * const *
 __engine_active(struct intel_engine_cs *engine)
 {
@@ -451,18 +447,24 @@ __await_execution(struct i915_request *rq,
 		cb->work.func = irq_execute_cb_hook;
 	}
 
-	spin_lock_irq(&signal->lock);
-	if (i915_request_is_active(signal) || __request_in_flight(signal)) {
-		if (hook) {
-			hook(rq, &signal->fence);
-			i915_request_put(signal);
-		}
-		i915_sw_fence_complete(cb->fence);
-		kmem_cache_free(global.slab_execute_cbs, cb);
-	} else {
-		__llist_add(&cb->work.llnode, &signal->execute_cb);
+	/*
+	 * Register the callback first, then see if the signaler is already
+	 * active. This ensures that if we race with the
+	 * __notify_execute_cb from i915_request_submit() and we are not
+	 * included in that list, we get a second bite of the cherry and
+	 * execute it ourselves. After this point, a future
+	 * i915_request_submit() will notify us.
+	 *
+	 * In i915_request_retire() we set the ACTIVE bit on a completed
+	 * request (then flush the execute_cb). So by registering the
+	 * callback first, then checking the ACTIVE bit, we serialise with
+	 * the completed/retired request.
+	 */
+	if (llist_add(&cb->work.llnode, &signal->execute_cb)) {
+		if (i915_request_is_active(signal) ||
+		    __request_in_flight(signal))
+			__notify_execute_cb_imm(signal);
 	}
-	spin_unlock_irq(&signal->lock);
 
 	return 0;
 }
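
Note that llist_add() returns true only when it adds to a previously empty
list, so the ACTIVE/in-flight check above is performed by the waiter whose
registration makes the list non-empty; once a flush has drained the list via
llist_del_all(), the next registration starts a new list and repeats the
check.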
@@ -588,10 +590,19 @@ bool __i915_request_submit(struct i915_request *request)
 	 * preempt-to-idle cycle on the target engine, all the while the
 	 * master execute_cb may refire.
 	 */
-	__notify_execute_cb(request);
-
-	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
-		i915_request_enable_breadcrumb(request);
+	__notify_execute_cb_irq(request);
+
+	/* We may be recursing from the signal callback of another i915 fence */
+	if (!i915_request_signaled(request)) {
+		spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+		if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+			     &request->fence.flags) &&
+		    !i915_request_enable_breadcrumb(request))
+			intel_engine_signal_breadcrumbs(engine);
+		spin_unlock(&request->lock);
+	}
 
 	return result;
 }