Commit daed3e44 authored by Lionel Landwerlin's avatar Lionel Landwerlin Committed by Chris Wilson

drm/i915/perf: implement active wait for noa configurations

NOA configuration take some amount of time to apply. That amount of
time depends on the size of the GT. There is no documented time for
this. For example, past experimentations with powergating
configuration changes seem to indicate a 60~70us delay. We go with
500us as default for now which should be over the required amount of
time (according to HW architects).

v2: Don't forget to save/restore registers used for the wait (Chris)

v3: Name used CS_GPR registers (Chris)
    Fix compile issue due to rebase (Lionel)

v4: Fix save/restore helpers (Umesh)

v5: Move noa_wait from drm_i915_private to i915_perf_stream (Lionel)

v6: Add missing struct declarations in i915_perf.h
Signed-off-by: default avatarLionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Link: https://patchwork.freedesktop.org/patch/msgid/20191012072308.30312-2-chris@chris-wilson.co.uk
parent 6a45008a
......@@ -163,7 +163,8 @@
#define MI_BATCH_BUFFER_START MI_INSTR(0x31, 0)
#define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */
#define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1)
#define MI_BATCH_RESOURCE_STREAMER (1<<10)
#define MI_BATCH_RESOURCE_STREAMER REG_BIT(10)
#define MI_BATCH_PREDICATE REG_BIT(15) /* HSW+ on RCS only*/
/*
* 3D instructions used by the kernel
......@@ -224,6 +225,7 @@
#define PIPE_CONTROL_CS_STALL (1<<20)
#define PIPE_CONTROL_TLB_INVALIDATE (1<<18)
#define PIPE_CONTROL_MEDIA_STATE_CLEAR (1<<16)
#define PIPE_CONTROL_WRITE_TIMESTAMP (3<<14)
#define PIPE_CONTROL_QW_WRITE (1<<14)
#define PIPE_CONTROL_POST_SYNC_OP_MASK (3<<14)
#define PIPE_CONTROL_DEPTH_STALL (1<<13)
......
......@@ -109,6 +109,11 @@ enum intel_gt_scratch_field {
/* 8 bytes */
INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA = 256,
/* 6 * 8 bytes */
INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR = 2048,
/* 4 bytes */
INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1 = 2096,
};
#endif /* __INTEL_GT_TYPES_H__ */
......@@ -3590,6 +3590,37 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
i915_wedged_get, i915_wedged_set,
"%llu\n");
static int
i915_perf_noa_delay_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
const u32 clk = RUNTIME_INFO(i915)->cs_timestamp_frequency_khz;
/*
* This would lead to infinite waits as we're doing timestamp
* difference on the CS with only 32bits.
*/
if (val > mul_u32_u32(U32_MAX, clk))
return -EINVAL;
atomic64_set(&i915->perf.noa_programming_delay, val);
return 0;
}
static int
i915_perf_noa_delay_get(void *data, u64 *val)
{
struct drm_i915_private *i915 = data;
*val = atomic64_read(&i915->perf.noa_programming_delay);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
i915_perf_noa_delay_get,
i915_perf_noa_delay_set,
"%llu\n");
#define DROP_UNBOUND BIT(0)
#define DROP_BOUND BIT(1)
#define DROP_RETIRE BIT(2)
......@@ -4345,6 +4376,7 @@ static const struct i915_debugfs_files {
const char *name;
const struct file_operations *fops;
} i915_debugfs_files[] = {
{"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
{"i915_wedged", &i915_wedged_fops},
{"i915_cache_sharing", &i915_cache_sharing_fops},
{"i915_gem_drop_caches", &i915_drop_caches_fops},
......
......@@ -198,6 +198,7 @@
#include "gem/i915_gem_context.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
#include "gt/intel_lrc_reg.h"
#include "i915_drv.h"
......@@ -1347,6 +1348,12 @@ free_oa_configs(struct i915_perf_stream *stream)
free_oa_config_bo(oa_bo);
}
static void
free_noa_wait(struct i915_perf_stream *stream)
{
i915_vma_unpin_and_release(&stream->noa_wait, 0);
}
static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
{
struct i915_perf *perf = stream->perf;
......@@ -1369,6 +1376,7 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
oa_put_render_ctx_id(stream);
free_oa_configs(stream);
free_noa_wait(stream);
if (perf->spurious_report_rs.missed) {
DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
......@@ -1529,6 +1537,206 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream)
return ret;
}
static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs,
bool save, i915_reg_t reg, u32 offset,
u32 dword_count)
{
u32 cmd;
u32 d;
cmd = save ? MI_STORE_REGISTER_MEM : MI_LOAD_REGISTER_MEM;
if (INTEL_GEN(stream->perf->i915) >= 8)
cmd++;
for (d = 0; d < dword_count; d++) {
*cs++ = cmd;
*cs++ = i915_mmio_reg_offset(reg) + 4 * d;
*cs++ = intel_gt_scratch_offset(stream->engine->gt,
offset) + 4 * d;
*cs++ = 0;
}
return cs;
}
static int alloc_noa_wait(struct i915_perf_stream *stream)
{
struct drm_i915_private *i915 = stream->perf->i915;
struct drm_i915_gem_object *bo;
struct i915_vma *vma;
const u64 delay_ticks = 0xffffffffffffffff -
DIV64_U64_ROUND_UP(
atomic64_read(&stream->perf->noa_programming_delay) *
RUNTIME_INFO(i915)->cs_timestamp_frequency_khz,
1000000ull);
const u32 base = stream->engine->mmio_base;
#define CS_GPR(x) GEN8_RING_CS_GPR(base, x)
u32 *batch, *ts0, *cs, *jump;
int ret, i;
enum {
START_TS,
NOW_TS,
DELTA_TS,
JUMP_PREDICATE,
DELTA_TARGET,
N_CS_GPR
};
bo = i915_gem_object_create_internal(i915, 4096);
if (IS_ERR(bo)) {
DRM_ERROR("Failed to allocate NOA wait batchbuffer\n");
return PTR_ERR(bo);
}
/*
* We pin in GGTT because we jump into this buffer now because
* multiple OA config BOs will have a jump to this address and it
* needs to be fixed during the lifetime of the i915/perf stream.
*/
vma = i915_gem_object_ggtt_pin(bo, NULL, 0, 0, PIN_HIGH);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto err_unref;
}
batch = cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
if (IS_ERR(batch)) {
ret = PTR_ERR(batch);
goto err_unpin;
}
/* Save registers. */
for (i = 0; i < N_CS_GPR; i++)
cs = save_restore_register(
stream, cs, true /* save */, CS_GPR(i),
INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
cs = save_restore_register(
stream, cs, true /* save */, MI_PREDICATE_RESULT_1,
INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
/* First timestamp snapshot location. */
ts0 = cs;
/*
* Initial snapshot of the timestamp register to implement the wait.
* We work with 32b values, so clear out the top 32b bits of the
* register because the ALU works 64bits.
*/
*cs++ = MI_LOAD_REGISTER_IMM(1);
*cs++ = i915_mmio_reg_offset(CS_GPR(START_TS)) + 4;
*cs++ = 0;
*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
*cs++ = i915_mmio_reg_offset(CS_GPR(START_TS));
/*
* This is the location we're going to jump back into until the
* required amount of time has passed.
*/
jump = cs;
/*
* Take another snapshot of the timestamp register. Take care to clear
* up the top 32bits of CS_GPR(1) as we're using it for other
* operations below.
*/
*cs++ = MI_LOAD_REGISTER_IMM(1);
*cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS)) + 4;
*cs++ = 0;
*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(base));
*cs++ = i915_mmio_reg_offset(CS_GPR(NOW_TS));
/*
* Do a diff between the 2 timestamps and store the result back into
* CS_GPR(1).
*/
*cs++ = MI_MATH(5);
*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
*cs++ = MI_MATH_SUB;
*cs++ = MI_MATH_STORE(MI_MATH_REG(DELTA_TS), MI_MATH_REG_ACCU);
*cs++ = MI_MATH_STORE(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
/*
* Transfer the carry flag (set to 1 if ts1 < ts0, meaning the
* timestamp have rolled over the 32bits) into the predicate register
* to be used for the predicated jump.
*/
*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
*cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
*cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
/* Restart from the beginning if we had timestamps roll over. */
*cs++ = (INTEL_GEN(i915) < 8 ?
MI_BATCH_BUFFER_START :
MI_BATCH_BUFFER_START_GEN8) |
MI_BATCH_PREDICATE;
*cs++ = i915_ggtt_offset(vma) + (ts0 - batch) * 4;
*cs++ = 0;
/*
* Now add the diff between to previous timestamps and add it to :
* (((1 * << 64) - 1) - delay_ns)
*
* When the Carry Flag contains 1 this means the elapsed time is
* longer than the expected delay, and we can exit the wait loop.
*/
*cs++ = MI_LOAD_REGISTER_IMM(2);
*cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET));
*cs++ = lower_32_bits(delay_ticks);
*cs++ = i915_mmio_reg_offset(CS_GPR(DELTA_TARGET)) + 4;
*cs++ = upper_32_bits(delay_ticks);
*cs++ = MI_MATH(4);
*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(DELTA_TS));
*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(DELTA_TARGET));
*cs++ = MI_MATH_ADD;
*cs++ = MI_MATH_STOREINV(MI_MATH_REG(JUMP_PREDICATE), MI_MATH_REG_CF);
/*
* Transfer the result into the predicate register to be used for the
* predicated jump.
*/
*cs++ = MI_LOAD_REGISTER_REG | (3 - 2);
*cs++ = i915_mmio_reg_offset(CS_GPR(JUMP_PREDICATE));
*cs++ = i915_mmio_reg_offset(MI_PREDICATE_RESULT_1);
/* Predicate the jump. */
*cs++ = (INTEL_GEN(i915) < 8 ?
MI_BATCH_BUFFER_START :
MI_BATCH_BUFFER_START_GEN8) |
MI_BATCH_PREDICATE;
*cs++ = i915_ggtt_offset(vma) + (jump - batch) * 4;
*cs++ = 0;
/* Restore registers. */
for (i = 0; i < N_CS_GPR; i++)
cs = save_restore_register(
stream, cs, false /* restore */, CS_GPR(i),
INTEL_GT_SCRATCH_FIELD_PERF_CS_GPR + 8 * i, 2);
cs = save_restore_register(
stream, cs, false /* restore */, MI_PREDICATE_RESULT_1,
INTEL_GT_SCRATCH_FIELD_PERF_PREDICATE_RESULT_1, 1);
/* And return to the ring. */
*cs++ = MI_BATCH_BUFFER_END;
GEM_BUG_ON(cs - batch > PAGE_SIZE / sizeof(*batch));
i915_gem_object_flush_map(bo);
i915_gem_object_unpin_map(bo);
stream->noa_wait = vma;
return 0;
err_unpin:
__i915_vma_unpin(vma);
err_unref:
i915_gem_object_put(bo);
return ret;
}
static void config_oa_regs(struct intel_uncore *uncore,
const struct i915_oa_reg *regs,
u32 n_regs)
......@@ -2206,6 +2414,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
}
}
ret = alloc_noa_wait(stream);
if (ret) {
DRM_DEBUG("Unable to allocate NOA wait batch buffer\n");
goto err_noa_wait_alloc;
}
stream->oa_config = i915_perf_get_oa_config(perf, props->metrics_set);
if (!stream->oa_config) {
DRM_DEBUG("Invalid OA config id=%i\n", props->metrics_set);
......@@ -2265,6 +2479,9 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
intel_engine_pm_put(stream->engine);
err_config:
free_noa_wait(stream);
err_noa_wait_alloc:
if (stream->ctx)
oa_put_render_ctx_id(stream);
......@@ -3651,6 +3868,9 @@ void i915_perf_init(struct drm_i915_private *i915)
ratelimit_set_flags(&perf->spurious_report_rs,
RATELIMIT_MSG_ON_RELEASE);
atomic64_set(&perf->noa_programming_delay,
500 * 1000 /* 500us */);
perf->i915 = i915;
}
}
......@@ -3680,3 +3900,7 @@ void i915_perf_fini(struct drm_i915_private *i915)
memset(&perf->ops, 0, sizeof(perf->ops));
perf->i915 = NULL;
}
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/i915_perf.c"
#endif
......@@ -266,6 +266,12 @@ struct i915_perf_stream {
*/
u32 head;
} oa_buffer;
/**
* A batch buffer doing a wait on the GPU for the NOA logic to be
* reprogrammed.
*/
struct i915_vma *noa_wait;
};
/**
......@@ -385,6 +391,8 @@ struct i915_perf {
struct i915_oa_ops ops;
const struct i915_oa_format *oa_formats;
atomic64_t noa_programming_delay;
};
#endif /* _I915_PERF_TYPES_H_ */
......@@ -545,7 +545,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
#define MI_PREDICATE_SRC0_UDW _MMIO(0x2400 + 4)
#define MI_PREDICATE_SRC1 _MMIO(0x2408)
#define MI_PREDICATE_SRC1_UDW _MMIO(0x2408 + 4)
#define MI_PREDICATE_DATA _MMIO(0x2410)
#define MI_PREDICATE_RESULT _MMIO(0x2418)
#define MI_PREDICATE_RESULT_1 _MMIO(0x241c)
#define MI_PREDICATE_RESULT_2 _MMIO(0x2214)
#define LOWER_SLICE_ENABLED (1 << 0)
#define LOWER_SLICE_DISABLED (0 << 0)
......
......@@ -35,3 +35,4 @@ selftest(reset, intel_reset_live_selftests)
selftest(hangcheck, intel_hangcheck_live_selftests)
selftest(execlists, intel_execlists_live_selftests)
selftest(guc, intel_guc_live_selftest)
selftest(perf, i915_perf_live_selftests)
/*
* SPDX-License-Identifier: MIT
*
* Copyright © 2019 Intel Corporation
*/
#include <linux/kref.h>
#include "gem/i915_gem_pm.h"
#include "gt/intel_gt.h"
#include "i915_selftest.h"
#include "igt_flush_test.h"
#include "lib_sw_fence.h"
static struct i915_perf_stream *
test_stream(struct i915_perf *perf)
{
struct drm_i915_perf_open_param param = {};
struct perf_open_properties props = {
.engine = intel_engine_lookup_user(perf->i915,
I915_ENGINE_CLASS_RENDER,
0),
.sample_flags = SAMPLE_OA_REPORT,
.oa_format = I915_OA_FORMAT_C4_B8,
.metrics_set = 1,
};
struct i915_perf_stream *stream;
stream = kzalloc(sizeof(*stream), GFP_KERNEL);
if (!stream)
return NULL;
stream->perf = perf;
mutex_lock(&perf->lock);
if (i915_oa_stream_init(stream, &param, &props)) {
kfree(stream);
stream = NULL;
}
mutex_unlock(&perf->lock);
return stream;
}
static void stream_destroy(struct i915_perf_stream *stream)
{
struct i915_perf *perf = stream->perf;
mutex_lock(&perf->lock);
i915_perf_destroy_locked(stream);
mutex_unlock(&perf->lock);
}
static int live_sanitycheck(void *arg)
{
struct drm_i915_private *i915 = arg;
struct i915_perf_stream *stream;
/* Quick check we can create a perf stream */
stream = test_stream(&i915->perf);
if (!stream)
return -EINVAL;
stream_destroy(stream);
return 0;
}
static int write_timestamp(struct i915_request *rq, int slot)
{
u32 *cs;
int len;
cs = intel_ring_begin(rq, 6);
if (IS_ERR(cs))
return PTR_ERR(cs);
len = 5;
if (INTEL_GEN(rq->i915) >= 8)
len++;
*cs++ = GFX_OP_PIPE_CONTROL(len);
*cs++ = PIPE_CONTROL_GLOBAL_GTT_IVB |
PIPE_CONTROL_STORE_DATA_INDEX |
PIPE_CONTROL_WRITE_TIMESTAMP;
*cs++ = slot * sizeof(u32);
*cs++ = 0;
*cs++ = 0;
*cs++ = 0;
intel_ring_advance(rq, cs);
return 0;
}
static ktime_t poll_status(struct i915_request *rq, int slot)
{
while (!intel_read_status_page(rq->engine, slot) &&
!i915_request_completed(rq))
cpu_relax();
return ktime_get();
}
static int live_noa_delay(void *arg)
{
struct drm_i915_private *i915 = arg;
struct i915_perf_stream *stream;
struct i915_request *rq;
ktime_t t0, t1;
u64 expected;
u32 delay;
int err;
int i;
/* Check that the GPU delays matches expectations */
stream = test_stream(&i915->perf);
if (!stream)
return -ENOMEM;
expected = atomic64_read(&stream->perf->noa_programming_delay);
if (stream->engine->class != RENDER_CLASS) {
err = -ENODEV;
goto out;
}
for (i = 0; i < 4; i++)
intel_write_status_page(stream->engine, 0x100 + i, 0);
rq = i915_request_create(stream->engine->kernel_context);
if (IS_ERR(rq)) {
err = PTR_ERR(rq);
goto out;
}
if (rq->engine->emit_init_breadcrumb &&
i915_request_timeline(rq)->has_initial_breadcrumb) {
err = rq->engine->emit_init_breadcrumb(rq);
if (err) {
i915_request_add(rq);
goto out;
}
}
err = write_timestamp(rq, 0x100);
if (err) {
i915_request_add(rq);
goto out;
}
err = rq->engine->emit_bb_start(rq,
i915_ggtt_offset(stream->noa_wait), 0,
I915_DISPATCH_SECURE);
if (err) {
i915_request_add(rq);
goto out;
}
err = write_timestamp(rq, 0x102);
if (err) {
i915_request_add(rq);
goto out;
}
i915_request_get(rq);
i915_request_add(rq);
preempt_disable();
t0 = poll_status(rq, 0x100);
t1 = poll_status(rq, 0x102);
preempt_enable();
pr_info("CPU delay: %lluns, expected %lluns\n",
ktime_sub(t1, t0), expected);
delay = intel_read_status_page(stream->engine, 0x102);
delay -= intel_read_status_page(stream->engine, 0x100);
delay = div_u64(mul_u32_u32(delay, 1000 * 1000),
RUNTIME_INFO(i915)->cs_timestamp_frequency_khz);
pr_info("GPU delay: %uns, expected %lluns\n",
delay, expected);
if (4 * delay < 3 * expected || 2 * delay > 3 * expected) {
pr_err("GPU delay [%uus] outside of expected threshold! [%lluus, %lluus]\n",
delay / 1000,
div_u64(3 * expected, 4000),
div_u64(3 * expected, 2000));
err = -EINVAL;
}
i915_request_put(rq);
out:
stream_destroy(stream);
return err;
}
int i915_perf_live_selftests(struct drm_i915_private *i915)
{
static const struct i915_subtest tests[] = {
SUBTEST(live_sanitycheck),
SUBTEST(live_noa_delay),
};
struct i915_perf *perf = &i915->perf;
if (!perf->metrics_kobj || !perf->ops.enable_metric_set)
return 0;
if (intel_gt_is_wedged(&i915->gt))
return 0;
return i915_subtests(tests, i915);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment