Commit 24e6ba7f authored by Grzegorz Bizon

Merge branch '283807-failure-reason-dashboard' into 'master'

Add metrics to track failure reasons of pipelines and jobs

See merge request gitlab-org/gitlab!57232
parents e71f0442 b1490d69
......@@ -286,9 +286,11 @@ module Ci
end
after_transition any => [:failed] do |pipeline|
next unless pipeline.auto_devops_source?
pipeline.run_after_commit do
::Gitlab::Ci::Pipeline::Metrics.pipeline_failure_reason_counter.increment(reason: pipeline.failure_reason)
pipeline.run_after_commit { AutoDevops::DisableWorker.perform_async(pipeline.id) }
AutoDevops::DisableWorker.perform_async(pipeline.id) if pipeline.auto_devops_source?
end
end
end
......
......@@ -181,15 +181,16 @@ class CommitStatus < ApplicationRecord
end
after_transition any => :failed do |commit_status|
next if Feature.enabled?(:async_add_build_failure_todo, commit_status.project, default_enabled: :yaml)
next unless commit_status.project
# rubocop: disable CodeReuse/ServiceClass
commit_status.run_after_commit do
MergeRequests::AddTodoWhenBuildFailsService
.new(project, nil).execute(self)
::Gitlab::Ci::Pipeline::Metrics.job_failure_reason_counter.increment(reason: commit_status.failure_reason)
# rubocop: disable CodeReuse/ServiceClass
next if Feature.enabled?(:async_add_build_failure_todo, commit_status.project, default_enabled: :yaml)
next unless commit_status.project
MergeRequests::AddTodoWhenBuildFailsService.new(project, nil).execute(self)
# rubocop: enable CodeReuse/ServiceClass
end
# rubocop: enable CodeReuse/ServiceClass
end
end
......
......@@ -19,7 +19,7 @@ module Ci
end
# NOTE(review): this listing is a rendered diff hunk without +/- markers, so the
# two assignments below appear to be the removed side (`Metrics.new`, an
# instance) and the added side (`Metrics`, the class itself) of one change —
# confirm against the commit before treating both as live code.
# Read literally as Ruby, the first `||=` sets @metrics and the second is then
# a no-op.
def metrics
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics.new
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics
end
private
......
......@@ -83,7 +83,8 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do
project: project,
current_user: user,
save_incompleted: false,
pipeline_seed: double(:seed, size: 2))
pipeline_seed: double(:seed, size: 2),
increment_pipeline_failure_reason_counter: true)
end
it 'does not drop the pipeline' do
......@@ -97,6 +98,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do
expect(step.break?).to be true
end
it 'increments the error metric' do
expect(command).to receive(:increment_pipeline_failure_reason_counter).with(:size_limit_exceeded)
subject
end
end
end
......
......@@ -84,7 +84,7 @@ module Gitlab
end
# NOTE(review): rendered diff hunk without +/- markers — the two assignments
# below look like the old (`Metrics.new`) and new (`Metrics`) versions of the
# same line; confirm against the commit.
# Read literally as Ruby, the first `||=` sets @metrics and the second is then
# a no-op.
def metrics
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics.new
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics
end
def observe_creation_duration(duration)
......@@ -97,6 +97,11 @@ module Gitlab
.observe({ source: pipeline.source.to_s }, pipeline.total_size)
end
# Adds one sample to the pipeline failure-reason counter exposed by +metrics+.
#
# reason - failure reason used as the `reason:` label; it is stringified with
#          +to_s+, and a nil/false reason is recorded as "unknown_failure".
#
# Returns whatever the counter's +increment+ returns.
def increment_pipeline_failure_reason_counter(reason)
  failure_counter = metrics.pipeline_failure_reason_counter
  label = (reason || :unknown_failure).to_s

  failure_counter.increment(reason: label)
end
# Whether +source+ is one of the two sources listed here:
# :ondemand_dast_scan or :webide.
def dangling_build?
  case source
  when :ondemand_dast_scan, :webide then true
  else false
  end
end
......
......@@ -12,7 +12,8 @@ module Gitlab
end
pipeline.add_error_message(message)
pipeline.drop!(drop_reason) if drop_reason && persist_pipeline?
drop_pipeline!(drop_reason)
# TODO: consider not to rely on AR errors directly as they can be
# polluted with other unrelated errors (e.g. state machine)
......@@ -24,8 +25,16 @@ module Gitlab
pipeline.add_warning_message(message)
end
def persist_pipeline?
command.save_incompleted && !pipeline.readonly?
private
# Either drops the pipeline with +drop_reason+ or only records the failure
# reason on the metrics counter.
#
# - Does nothing when the pipeline is read-only.
# - Calls +pipeline.drop!+ when a reason is given and the command asks for
#   incomplete pipelines to be saved.
# - Otherwise (no reason, or not saving) hands the reason, possibly nil, to
#   +command.increment_pipeline_failure_reason_counter+.
def drop_pipeline!(drop_reason)
  return if pipeline.readonly?

  should_drop = drop_reason && command.save_incompleted
  return command.increment_pipeline_failure_reason_counter(drop_reason) unless should_drop

  pipeline.drop!(drop_reason)
end
end
end
......
......@@ -14,7 +14,7 @@ module Gitlab
end
# NOTE(review): rendered diff hunk without +/- markers — the two expressions
# below look like the old (instance-based `Metrics.new...`) and new
# (class-level `Metrics...`) versions of the same line; confirm against the
# commit. Read literally as Ruby, both are evaluated and the value of the
# second one is returned.
def counter
::Gitlab::Ci::Pipeline::Metrics.new.pipelines_created_counter
::Gitlab::Ci::Pipeline::Metrics.pipelines_created_counter
end
end
end
......
......@@ -4,9 +4,9 @@ module Gitlab
module Ci
module Pipeline
class Metrics
include Gitlab::Utils::StrongMemoize
extend Gitlab::Utils::StrongMemoize
def pipeline_creation_duration_histogram
def self.pipeline_creation_duration_histogram
strong_memoize(:pipeline_creation_duration_histogram) do
name = :gitlab_ci_pipeline_creation_duration_seconds
comment = 'Pipeline creation duration'
......@@ -17,7 +17,7 @@ module Gitlab
end
end
def pipeline_size_histogram
def self.pipeline_size_histogram
strong_memoize(:pipeline_size_histogram) do
name = :gitlab_ci_pipeline_size_builds
comment = 'Pipeline size'
......@@ -28,7 +28,7 @@ module Gitlab
end
end
def pipeline_processing_events_counter
def self.pipeline_processing_events_counter
strong_memoize(:pipeline_processing_events_counter) do
name = :gitlab_ci_pipeline_processing_events_total
comment = 'Total amount of pipeline processing events'
......@@ -37,7 +37,7 @@ module Gitlab
end
end
def pipelines_created_counter
def self.pipelines_created_counter
strong_memoize(:pipelines_created_count) do
name = :pipelines_created_total
comment = 'Counter of pipelines created'
......@@ -46,7 +46,7 @@ module Gitlab
end
end
def legacy_update_jobs_counter
def self.legacy_update_jobs_counter
strong_memoize(:legacy_update_jobs_counter) do
name = :ci_legacy_update_jobs_as_retried_total
comment = 'Counter of occurrences when jobs were not being set as retried before update_retried'
......@@ -54,6 +54,24 @@ module Gitlab
Gitlab::Metrics.counter(name, comment)
end
end
# Counter for pipeline failure reasons, registered through
# Gitlab::Metrics.counter as :gitlab_ci_pipeline_failure_reasons and wrapped
# in strong_memoize under the key :pipeline_failure_reason_counter.
def self.pipeline_failure_reason_counter
  strong_memoize(:pipeline_failure_reason_counter) do
    Gitlab::Metrics.counter(
      :gitlab_ci_pipeline_failure_reasons,
      'Counter of pipeline failure reasons'
    )
  end
end
# Counter for job failure reasons, registered through Gitlab::Metrics.counter
# as :gitlab_ci_job_failure_reasons and wrapped in strong_memoize under the
# key :job_failure_reason_counter.
def self.job_failure_reason_counter
  strong_memoize(:job_failure_reason_counter) do
    Gitlab::Metrics.counter(
      :gitlab_ci_job_failure_reasons,
      'Counter of job failure reasons'
    )
  end
end
end
end
end
......
......@@ -321,4 +321,25 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Command do
it { is_expected.to be_falsey }
end
end
# Verifies that the command bumps the gitlab_ci_pipeline_failure_reasons
# counter by one, labelled with the stringified reason, and that a nil reason
# is recorded under 'unknown_failure'.
describe '#increment_pipeline_failure_reason_counter' do
  let(:command) { described_class.new }
  let(:reason) { :size_limit_exceeded }
  let(:counter) { Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc') }

  subject { command.increment_pipeline_failure_reason_counter(reason) }

  it 'increments the error metric' do
    expect { subject }.to change { counter.get(reason: reason.to_s) }.by(1)
  end

  context 'when the reason is nil' do
    let(:reason) { nil }

    it 'increments the error metric with unknown_failure' do
      expect { subject }.to change { counter.get(reason: 'unknown_failure') }.by(1)
    end
  end
end
end
......@@ -11,7 +11,7 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
let(:save_incompleted) { false }
let(:command) do
double(:command,
Gitlab::Ci::Pipeline::Chain::Command.new(
project: project,
pipeline_seed: pipeline_seed,
save_incompleted: save_incompleted
......@@ -49,6 +49,11 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
expect(pipeline.deployments_limit_exceeded?).to be true
end
it 'calls increment_pipeline_failure_reason_counter' do
counter = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')
expect { perform }.to change { counter.get(reason: 'deployments_limit_exceeded') }.by(1)
end
end
context 'when not saving incomplete pipelines' do
......@@ -71,6 +76,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
expect(pipeline.errors.messages).to include(base: ['Pipeline has too many deployments! Requested 2, but the limit is 1.'])
end
it 'increments the error metric' do
expect(command).to receive(:increment_pipeline_failure_reason_counter).with(:deployments_limit_exceeded)
perform
end
end
it 'logs the error' do
......
......@@ -96,6 +96,11 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Populate do
it 'wastes pipeline iid' do
expect(InternalId.ci_pipelines.where(project_id: project.id).last.last_value).to be > 0
end
it 'increments the error metric' do
counter = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')
expect { run_chain }.to change { counter.get(reason: 'unknown_failure') }.by(1)
end
end
describe 'pipeline protect' do
......
......@@ -3867,6 +3867,16 @@ RSpec.describe Ci::Pipeline, :mailer, factory_default: :keep do
pipeline.drop
end
end
# Dropping a running pipeline with an explicit failure reason is expected to
# raise the gitlab_ci_pipeline_failure_reasons counter for that reason by one.
context 'with failure_reason' do
let(:pipeline) { create(:ci_pipeline, :running) }
let(:failure_reason) { 'config_error' }
# 'desc' is a placeholder description — presumably the counter is resolved by
# its metric name; verify against Gitlab::Metrics.counter.
let(:counter) { Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc') }
it 'increments the counter with the failure_reason' do
expect { pipeline.drop!(failure_reason) }.to change { counter.get(reason: failure_reason) }.by(1)
end
end
end
end
......
......@@ -629,30 +629,45 @@ RSpec.describe CommitStatus do
end
end
describe 'set failure_reason when drop' do
describe '#drop' do
let(:commit_status) { create(:commit_status, :created) }
let(:counter) { Gitlab::Metrics.counter(:gitlab_ci_job_failure_reasons, 'desc') }
let(:failure_reason) { reason.to_s }
subject do
commit_status.drop!(reason)
commit_status
end
shared_examples 'incrementing failure reason counter' do
it 'increments the counter with the failure_reason' do
expect { subject }.to change { counter.get(reason: failure_reason) }.by(1)
end
end
context 'when failure_reason is nil' do
let(:reason) { }
let(:failure_reason) { 'unknown_failure' }
it { is_expected.to be_unknown_failure }
it_behaves_like 'incrementing failure reason counter'
end
context 'when failure_reason is script_failure' do
let(:reason) { :script_failure }
it { is_expected.to be_script_failure }
it_behaves_like 'incrementing failure reason counter'
end
context 'when failure_reason is unmet_prerequisites' do
let(:reason) { :unmet_prerequisites }
it { is_expected.to be_unmet_prerequisites }
it_behaves_like 'incrementing failure reason counter'
end
end
......
......@@ -71,19 +71,21 @@ RSpec.describe Ci::CreatePipelineService do
end
it 'increments the prometheus counter' do
expect(Gitlab::Metrics).to receive(:counter)
.with(:pipelines_created_total, "Counter of pipelines created")
.and_call_original
allow(Gitlab::Metrics).to receive(:counter).and_call_original # allow other counters
counter = spy('pipeline created counter')
allow(Gitlab::Ci::Pipeline::Metrics)
.to receive(:pipelines_created_counter).and_return(counter)
pipeline
expect(counter).to have_received(:increment)
end
it 'records pipeline size in a prometheus histogram' do
histogram = spy('pipeline size histogram')
allow(Gitlab::Ci::Pipeline::Metrics)
.to receive(:new).and_return(histogram)
.to receive(:pipeline_size_histogram).and_return(histogram)
execute_service
......@@ -580,6 +582,13 @@ RSpec.describe Ci::CreatePipelineService do
it_behaves_like 'a failed pipeline'
it 'increments the error metric' do
stub_ci_pipeline_yaml_file(ci_yaml)
counter = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')
expect { execute_service }.to change { counter.get(reason: 'config_error') }.by(1)
end
context 'when receive git commit' do
before do
allow_any_instance_of(Ci::Pipeline).to receive(:git_commit_message) { message }
......
......@@ -10,6 +10,14 @@ RSpec.describe Ci::ProcessPipelineService do
create(:ci_empty_pipeline, ref: 'master', project: project)
end
let(:pipeline_processing_events_counter) { double(increment: true) }
let(:legacy_update_jobs_counter) { double(increment: true) }
let(:metrics) do
double(pipeline_processing_events_counter: pipeline_processing_events_counter,
legacy_update_jobs_counter: legacy_update_jobs_counter)
end
subject { described_class.new(pipeline) }
before do
......@@ -17,22 +25,13 @@ RSpec.describe Ci::ProcessPipelineService do
stub_not_protect_default_branch
project.add_developer(user)
allow(subject).to receive(:metrics).and_return(metrics)
end
describe 'processing events counter' do
let(:metrics) { double('pipeline metrics') }
let(:counter) { double('events counter') }
before do
allow(subject)
.to receive(:metrics).and_return(metrics)
allow(metrics)
.to receive(:pipeline_processing_events_counter)
.and_return(counter)
end
it 'increments processing events counter' do
expect(counter).to receive(:increment)
expect(pipeline_processing_events_counter).to receive(:increment)
subject.execute
end
......@@ -64,33 +63,22 @@ RSpec.describe Ci::ProcessPipelineService do
expect(all_builds.retried).to contain_exactly(build_retried)
end
context 'counter ci_legacy_update_jobs_as_retried_total' do
let(:counter) { double(increment: true) }
it 'increments the counter' do
expect(legacy_update_jobs_counter).to receive(:increment)
subject.execute
end
context 'when the previous build has already retried column true' do
before do
allow(Gitlab::Metrics).to receive(:counter).and_call_original
allow(Gitlab::Metrics).to receive(:counter)
.with(:ci_legacy_update_jobs_as_retried_total, anything)
.and_return(counter)
build_retried.update_columns(retried: true)
end
it 'increments the counter' do
expect(counter).to receive(:increment)
it 'does not increment the counter' do
expect(legacy_update_jobs_counter).not_to receive(:increment)
subject.execute
end
context 'when the previous build has already retried column true' do
before do
build_retried.update_columns(retried: true)
end
it 'does not increment the counter' do
expect(counter).not_to receive(:increment)
subject.execute
end
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment