Commit d37a808e authored by Sean McGivern's avatar Sean McGivern

Report to Sentry for long Redis durations

We're seeing some very high (over 5 seconds) Redis durations in our
logs, and we don't know what's causing them. This is an attempt to
answer that by sending an event to Sentry when we detect this
happening.

It's behind the feature flag `report_on_long_redis_durations`, which is
off by default.

starting
parent 523cb9df
---
name: report_on_long_redis_durations
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/67512
rollout_issue_url: https://gitlab.com/gitlab-com/gl-infra/scalability/-/issues/1183
milestone: '14.2'
type: development
group: team::Scalability
default_enabled: false
......@@ -5,8 +5,21 @@ module Gitlab
module RedisInterceptor
APDEX_EXCLUDE = %w[brpop blpop brpoplpush bzpopmin bzpopmax xread xreadgroup].freeze
# These are temporary to help with investigating
# https://gitlab.com/gitlab-com/gl-infra/scalability/-/issues/1183
DURATION_ERROR_THRESHOLD = 1.25.seconds
class MysteryRedisDurationError < StandardError
attr_reader :backtrace
def initialize(backtrace)
@backtrace = backtrace
end
end
def call(*args, &block)
start = Gitlab::Metrics::System.monotonic_time # must come first so that 'start' is always defined
start_real_time = Time.now
instrumentation_class.instance_count_request
instrumentation_class.redis_cluster_validate!(args.first)
......@@ -27,6 +40,13 @@ module Gitlab
instrumentation_class.add_duration(duration)
instrumentation_class.add_call_details(duration, args)
end
if duration > DURATION_ERROR_THRESHOLD && Feature.enabled?(:report_on_long_redis_durations, default_enabled: :yaml)
Gitlab::ErrorTracking.track_exception(MysteryRedisDurationError.new(caller),
command: command_from_args(args),
duration: duration,
timestamp: start_real_time.iso8601(5))
end
end
def write(command)
......
......@@ -111,4 +111,35 @@ RSpec.describe Gitlab::Instrumentation::RedisInterceptor, :clean_gitlab_redis_sh
end
end
end
context 'when a command takes longer than DURATION_ERROR_THRESHOLD' do
let(:threshold) { 0.5 }
before do
stub_const("#{described_class}::DURATION_ERROR_THRESHOLD", threshold)
end
context 'when report_on_long_redis_durations is disabled' do
it 'does nothing' do
stub_feature_flags(report_on_long_redis_durations: false)
expect(Gitlab::ErrorTracking).not_to receive(:track_exception)
Gitlab::Redis::SharedState.with { |r| r.mget('foo', 'foo') { sleep threshold + 0.1 } }
end
end
context 'when report_on_long_redis_durations is enabled' do
it 'tracks an exception and continues' do
expect(Gitlab::ErrorTracking)
.to receive(:track_exception)
.with(an_instance_of(described_class::MysteryRedisDurationError),
command: 'mget',
duration: be > threshold,
timestamp: a_string_matching(/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{5}/))
Gitlab::Redis::SharedState.with { |r| r.mget('foo', 'foo') { sleep threshold + 0.1 } }
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment