Commit 37f94825 authored by Stan Hu's avatar Stan Hu

Merge branch 'an-sidekiq-records-failure-durations' into 'master'

Record latencies for sidekiq failures

Closes gitlab-com/gl-infra/scalability#46

See merge request gitlab-org/gitlab!18909
parents 23c170f8 c17922ea
---
title: Record latencies for Sidekiq failures
merge_request: 18909
author:
type: performance
...@@ -21,29 +21,35 @@ module Gitlab ...@@ -21,29 +21,35 @@ module Gitlab
@metrics[:sidekiq_jobs_retried_total].increment(labels, 1) @metrics[:sidekiq_jobs_retried_total].increment(labels, 1)
end end
job_succeeded = false
monotonic_time_start = Gitlab::Metrics::System.monotonic_time
job_thread_cputime_start = get_thread_cputime job_thread_cputime_start = get_thread_cputime
begin
realtime = Benchmark.realtime do
yield yield
end job_succeeded = true
ensure
monotonic_time_end = Gitlab::Metrics::System.monotonic_time
job_thread_cputime_end = get_thread_cputime
job_thread_cputime_end = get_thread_cputime monotonic_time = monotonic_time_end - monotonic_time_start
job_thread_cputime = job_thread_cputime_end - job_thread_cputime_start job_thread_cputime = job_thread_cputime_end - job_thread_cputime_start
@metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime)
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, realtime) # sidekiq_running_jobs, sidekiq_jobs_failed_total should not include the job_status label
rescue Exception # rubocop: disable Lint/RescueException @metrics[:sidekiq_running_jobs].increment(labels, -1)
@metrics[:sidekiq_jobs_failed_total].increment(labels, 1) @metrics[:sidekiq_jobs_failed_total].increment(labels, 1) unless job_succeeded
raise
ensure # job_status: done, fail match the job_status attribute in structured logging
@metrics[:sidekiq_running_jobs].increment(labels, -1) labels[:job_status] = job_succeeded ? :done : :fail
@metrics[:sidekiq_jobs_cpu_seconds].observe(labels, job_thread_cputime)
@metrics[:sidekiq_jobs_completion_seconds].observe(labels, monotonic_time)
end
end end
private private
def init_metrics def init_metrics
{ {
sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_cpu_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_cpu_seconds, 'Seconds of cpu time to run sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS), sidekiq_jobs_completion_seconds: ::Gitlab::Metrics.histogram(:sidekiq_jobs_completion_seconds, 'Seconds to complete sidekiq job', {}, SIDEKIQ_LATENCY_BUCKETS),
sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'), sidekiq_jobs_failed_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_failed_total, 'Sidekiq jobs failed'),
sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'), sidekiq_jobs_retried_total: ::Gitlab::Metrics.counter(:sidekiq_jobs_retried_total, 'Sidekiq jobs retried'),
......
...@@ -44,12 +44,14 @@ describe Gitlab::SidekiqMiddleware::Metrics do ...@@ -44,12 +44,14 @@ describe Gitlab::SidekiqMiddleware::Metrics do
it 'sets queue specific metrics' do it 'sets queue specific metrics' do
labels = { queue: :test } labels = { queue: :test }
labels_with_job_status = { queue: :test, job_status: :done }
allow(middleware).to receive(:get_thread_cputime).and_return(1, 3) allow(middleware).to receive(:get_thread_cputime).and_return(1, 3)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(2, 3)
expect(user_execution_seconds_metric).to receive(:observe).with(labels, 2)
expect(running_jobs_metric).to receive(:increment).with(labels, 1) expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(running_jobs_metric).to receive(:increment).with(labels, -1) expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(completion_seconds_metric).to receive(:observe).with(labels, kind_of(Numeric)) expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, 2)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, 1)
middleware.call(worker, {}, :test) { nil } middleware.call(worker, {}, :test) { nil }
end end
...@@ -74,8 +76,18 @@ describe Gitlab::SidekiqMiddleware::Metrics do ...@@ -74,8 +76,18 @@ describe Gitlab::SidekiqMiddleware::Metrics do
context 'when error is raised' do context 'when error is raised' do
it 'sets sidekiq_jobs_failed_total and reraises' do it 'sets sidekiq_jobs_failed_total and reraises' do
expect(failed_total_metric).to receive(:increment) labels = { queue: :test }
expect { middleware.call(worker, {}, :test) { raise } }.to raise_error labels_with_job_status = { queue: :test, job_status: :fail }
allow(middleware).to receive(:get_thread_cputime).and_return(1, 4)
allow(Gitlab::Metrics::System).to receive(:monotonic_time).and_return(2, 6)
expect(running_jobs_metric).to receive(:increment).with(labels, 1)
expect(running_jobs_metric).to receive(:increment).with(labels, -1)
expect(failed_total_metric).to receive(:increment).with(labels, 1)
expect(user_execution_seconds_metric).to receive(:observe).with(labels_with_job_status, 3)
expect(completion_seconds_metric).to receive(:observe).with(labels_with_job_status, 4)
expect { middleware.call(worker, {}, :test) { raise StandardError, "Failed" } }.to raise_error(StandardError, "Failed")
end end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment