Commit b1490d69 authored by Furkan Ayhan's avatar Furkan Ayhan Committed by Grzegorz Bizon

Add metrics to track failure reasons of pipelines and jobs

parent d4df8986
...@@ -286,9 +286,11 @@ module Ci ...@@ -286,9 +286,11 @@ module Ci
end end
after_transition any => [:failed] do |pipeline| after_transition any => [:failed] do |pipeline|
next unless pipeline.auto_devops_source? pipeline.run_after_commit do
::Gitlab::Ci::Pipeline::Metrics.pipeline_failure_reason_counter.increment(reason: pipeline.failure_reason)
pipeline.run_after_commit { AutoDevops::DisableWorker.perform_async(pipeline.id) } AutoDevops::DisableWorker.perform_async(pipeline.id) if pipeline.auto_devops_source?
end
end end
end end
......
...@@ -181,15 +181,16 @@ class CommitStatus < ApplicationRecord ...@@ -181,15 +181,16 @@ class CommitStatus < ApplicationRecord
end end
after_transition any => :failed do |commit_status| after_transition any => :failed do |commit_status|
next if Feature.enabled?(:async_add_build_failure_todo, commit_status.project, default_enabled: :yaml)
next unless commit_status.project
# rubocop: disable CodeReuse/ServiceClass
commit_status.run_after_commit do commit_status.run_after_commit do
MergeRequests::AddTodoWhenBuildFailsService ::Gitlab::Ci::Pipeline::Metrics.job_failure_reason_counter.increment(reason: commit_status.failure_reason)
.new(project, nil).execute(self)
# rubocop: disable CodeReuse/ServiceClass
next if Feature.enabled?(:async_add_build_failure_todo, commit_status.project, default_enabled: :yaml)
next unless commit_status.project
MergeRequests::AddTodoWhenBuildFailsService.new(project, nil).execute(self)
# rubocop: enable CodeReuse/ServiceClass
end end
# rubocop: enable CodeReuse/ServiceClass
end end
end end
......
...@@ -19,7 +19,7 @@ module Ci ...@@ -19,7 +19,7 @@ module Ci
end end
def metrics def metrics
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics.new @metrics ||= ::Gitlab::Ci::Pipeline::Metrics
end end
private private
......
...@@ -83,7 +83,8 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do ...@@ -83,7 +83,8 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do
project: project, project: project,
current_user: user, current_user: user,
save_incompleted: false, save_incompleted: false,
pipeline_seed: double(:seed, size: 2)) pipeline_seed: double(:seed, size: 2),
increment_pipeline_failure_reason_counter: true)
end end
it 'does not drop the pipeline' do it 'does not drop the pipeline' do
...@@ -97,6 +98,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do ...@@ -97,6 +98,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Size do
expect(step.break?).to be true expect(step.break?).to be true
end end
it 'increments the error metric' do
expect(command).to receive(:increment_pipeline_failure_reason_counter).with(:size_limit_exceeded)
subject
end
end end
end end
......
...@@ -84,7 +84,7 @@ module Gitlab ...@@ -84,7 +84,7 @@ module Gitlab
end end
def metrics def metrics
@metrics ||= ::Gitlab::Ci::Pipeline::Metrics.new @metrics ||= ::Gitlab::Ci::Pipeline::Metrics
end end
def observe_creation_duration(duration) def observe_creation_duration(duration)
...@@ -97,6 +97,11 @@ module Gitlab ...@@ -97,6 +97,11 @@ module Gitlab
.observe({ source: pipeline.source.to_s }, pipeline.total_size) .observe({ source: pipeline.source.to_s }, pipeline.total_size)
end end
def increment_pipeline_failure_reason_counter(reason)
metrics.pipeline_failure_reason_counter
.increment(reason: (reason || :unknown_failure).to_s)
end
def dangling_build? def dangling_build?
%i[ondemand_dast_scan webide].include?(source) %i[ondemand_dast_scan webide].include?(source)
end end
......
...@@ -12,7 +12,8 @@ module Gitlab ...@@ -12,7 +12,8 @@ module Gitlab
end end
pipeline.add_error_message(message) pipeline.add_error_message(message)
pipeline.drop!(drop_reason) if drop_reason && persist_pipeline?
drop_pipeline!(drop_reason)
# TODO: consider not to rely on AR errors directly as they can be # TODO: consider not to rely on AR errors directly as they can be
# polluted with other unrelated errors (e.g. state machine) # polluted with other unrelated errors (e.g. state machine)
...@@ -24,8 +25,16 @@ module Gitlab ...@@ -24,8 +25,16 @@ module Gitlab
pipeline.add_warning_message(message) pipeline.add_warning_message(message)
end end
def persist_pipeline? private
command.save_incompleted && !pipeline.readonly?
def drop_pipeline!(drop_reason)
return if pipeline.readonly?
if drop_reason && command.save_incompleted
pipeline.drop!(drop_reason)
else
command.increment_pipeline_failure_reason_counter(drop_reason)
end
end end
end end
end end
......
...@@ -14,7 +14,7 @@ module Gitlab ...@@ -14,7 +14,7 @@ module Gitlab
end end
def counter def counter
::Gitlab::Ci::Pipeline::Metrics.new.pipelines_created_counter ::Gitlab::Ci::Pipeline::Metrics.pipelines_created_counter
end end
end end
end end
......
...@@ -4,9 +4,9 @@ module Gitlab ...@@ -4,9 +4,9 @@ module Gitlab
module Ci module Ci
module Pipeline module Pipeline
class Metrics class Metrics
include Gitlab::Utils::StrongMemoize extend Gitlab::Utils::StrongMemoize
def pipeline_creation_duration_histogram def self.pipeline_creation_duration_histogram
strong_memoize(:pipeline_creation_duration_histogram) do strong_memoize(:pipeline_creation_duration_histogram) do
name = :gitlab_ci_pipeline_creation_duration_seconds name = :gitlab_ci_pipeline_creation_duration_seconds
comment = 'Pipeline creation duration' comment = 'Pipeline creation duration'
...@@ -17,7 +17,7 @@ module Gitlab ...@@ -17,7 +17,7 @@ module Gitlab
end end
end end
def pipeline_size_histogram def self.pipeline_size_histogram
strong_memoize(:pipeline_size_histogram) do strong_memoize(:pipeline_size_histogram) do
name = :gitlab_ci_pipeline_size_builds name = :gitlab_ci_pipeline_size_builds
comment = 'Pipeline size' comment = 'Pipeline size'
...@@ -28,7 +28,7 @@ module Gitlab ...@@ -28,7 +28,7 @@ module Gitlab
end end
end end
def pipeline_processing_events_counter def self.pipeline_processing_events_counter
strong_memoize(:pipeline_processing_events_counter) do strong_memoize(:pipeline_processing_events_counter) do
name = :gitlab_ci_pipeline_processing_events_total name = :gitlab_ci_pipeline_processing_events_total
comment = 'Total amount of pipeline processing events' comment = 'Total amount of pipeline processing events'
...@@ -37,7 +37,7 @@ module Gitlab ...@@ -37,7 +37,7 @@ module Gitlab
end end
end end
def pipelines_created_counter def self.pipelines_created_counter
strong_memoize(:pipelines_created_count) do strong_memoize(:pipelines_created_count) do
name = :pipelines_created_total name = :pipelines_created_total
comment = 'Counter of pipelines created' comment = 'Counter of pipelines created'
...@@ -46,7 +46,7 @@ module Gitlab ...@@ -46,7 +46,7 @@ module Gitlab
end end
end end
def legacy_update_jobs_counter def self.legacy_update_jobs_counter
strong_memoize(:legacy_update_jobs_counter) do strong_memoize(:legacy_update_jobs_counter) do
name = :ci_legacy_update_jobs_as_retried_total name = :ci_legacy_update_jobs_as_retried_total
comment = 'Counter of occurrences when jobs were not being set as retried before update_retried' comment = 'Counter of occurrences when jobs were not being set as retried before update_retried'
...@@ -54,6 +54,24 @@ module Gitlab ...@@ -54,6 +54,24 @@ module Gitlab
Gitlab::Metrics.counter(name, comment) Gitlab::Metrics.counter(name, comment)
end end
end end
def self.pipeline_failure_reason_counter
strong_memoize(:pipeline_failure_reason_counter) do
name = :gitlab_ci_pipeline_failure_reasons
comment = 'Counter of pipeline failure reasons'
Gitlab::Metrics.counter(name, comment)
end
end
def self.job_failure_reason_counter
strong_memoize(:job_failure_reason_counter) do
name = :gitlab_ci_job_failure_reasons
comment = 'Counter of job failure reasons'
Gitlab::Metrics.counter(name, comment)
end
end
end end
end end
end end
......
...@@ -321,4 +321,25 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Command do ...@@ -321,4 +321,25 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Command do
it { is_expected.to be_falsey } it { is_expected.to be_falsey }
end end
end end
describe '#increment_pipeline_failure_reason_counter' do
let(:command) { described_class.new }
let(:reason) { :size_limit_exceeded }
subject { command.increment_pipeline_failure_reason_counter(reason) }
it 'increments the error metric' do
counter = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')
expect { subject }.to change { counter.get(reason: reason.to_s) }.by(1)
end
context 'when the reason is nil' do
let(:reason) { nil }
it 'increments the error metric with unknown_failure' do
counter = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')
expect { subject }.to change { counter.get(reason: 'unknown_failure') }.by(1)
end
end
end
end end
...@@ -11,7 +11,7 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do ...@@ -11,7 +11,7 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
let(:save_incompleted) { false } let(:save_incompleted) { false }
let(:command) do let(:command) do
double(:command, Gitlab::Ci::Pipeline::Chain::Command.new(
project: project, project: project,
pipeline_seed: pipeline_seed, pipeline_seed: pipeline_seed,
save_incompleted: save_incompleted save_incompleted: save_incompleted
...@@ -49,6 +49,11 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do ...@@ -49,6 +49,11 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
expect(pipeline.deployments_limit_exceeded?).to be true expect(pipeline.deployments_limit_exceeded?).to be true
end end
it 'calls increment_pipeline_failure_reason_counter' do
counter = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')
expect { perform }.to change { counter.get(reason: 'deployments_limit_exceeded') }.by(1)
end
end end
context 'when not saving incomplete pipelines' do context 'when not saving incomplete pipelines' do
...@@ -71,6 +76,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do ...@@ -71,6 +76,12 @@ RSpec.describe ::Gitlab::Ci::Pipeline::Chain::Limit::Deployments do
expect(pipeline.errors.messages).to include(base: ['Pipeline has too many deployments! Requested 2, but the limit is 1.']) expect(pipeline.errors.messages).to include(base: ['Pipeline has too many deployments! Requested 2, but the limit is 1.'])
end end
it 'increments the error metric' do
expect(command).to receive(:increment_pipeline_failure_reason_counter).with(:deployments_limit_exceeded)
perform
end
end end
it 'logs the error' do it 'logs the error' do
......
...@@ -96,6 +96,11 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Populate do ...@@ -96,6 +96,11 @@ RSpec.describe Gitlab::Ci::Pipeline::Chain::Populate do
it 'wastes pipeline iid' do it 'wastes pipeline iid' do
expect(InternalId.ci_pipelines.where(project_id: project.id).last.last_value).to be > 0 expect(InternalId.ci_pipelines.where(project_id: project.id).last.last_value).to be > 0
end end
it 'increments the error metric' do
counter = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')
expect { run_chain }.to change { counter.get(reason: 'unknown_failure') }.by(1)
end
end end
describe 'pipeline protect' do describe 'pipeline protect' do
......
...@@ -3836,6 +3836,16 @@ RSpec.describe Ci::Pipeline, :mailer, factory_default: :keep do ...@@ -3836,6 +3836,16 @@ RSpec.describe Ci::Pipeline, :mailer, factory_default: :keep do
pipeline.drop pipeline.drop
end end
end end
context 'with failure_reason' do
let(:pipeline) { create(:ci_pipeline, :running) }
let(:failure_reason) { 'config_error' }
let(:counter) { Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc') }
it 'increments the counter with the failure_reason' do
expect { pipeline.drop!(failure_reason) }.to change { counter.get(reason: failure_reason) }.by(1)
end
end
end end
end end
......
...@@ -629,30 +629,45 @@ RSpec.describe CommitStatus do ...@@ -629,30 +629,45 @@ RSpec.describe CommitStatus do
end end
end end
describe 'set failure_reason when drop' do describe '#drop' do
let(:commit_status) { create(:commit_status, :created) } let(:commit_status) { create(:commit_status, :created) }
let(:counter) { Gitlab::Metrics.counter(:gitlab_ci_job_failure_reasons, 'desc') }
let(:failure_reason) { reason.to_s }
subject do subject do
commit_status.drop!(reason) commit_status.drop!(reason)
commit_status commit_status
end end
shared_examples 'incrementing failure reason counter' do
it 'increments the counter with the failure_reason' do
expect { subject }.to change { counter.get(reason: failure_reason) }.by(1)
end
end
context 'when failure_reason is nil' do context 'when failure_reason is nil' do
let(:reason) { } let(:reason) { }
let(:failure_reason) { 'unknown_failure' }
it { is_expected.to be_unknown_failure } it { is_expected.to be_unknown_failure }
it_behaves_like 'incrementing failure reason counter'
end end
context 'when failure_reason is script_failure' do context 'when failure_reason is script_failure' do
let(:reason) { :script_failure } let(:reason) { :script_failure }
it { is_expected.to be_script_failure } it { is_expected.to be_script_failure }
it_behaves_like 'incrementing failure reason counter'
end end
context 'when failure_reason is unmet_prerequisites' do context 'when failure_reason is unmet_prerequisites' do
let(:reason) { :unmet_prerequisites } let(:reason) { :unmet_prerequisites }
it { is_expected.to be_unmet_prerequisites } it { is_expected.to be_unmet_prerequisites }
it_behaves_like 'incrementing failure reason counter'
end end
end end
......
...@@ -71,19 +71,21 @@ RSpec.describe Ci::CreatePipelineService do ...@@ -71,19 +71,21 @@ RSpec.describe Ci::CreatePipelineService do
end end
it 'increments the prometheus counter' do it 'increments the prometheus counter' do
expect(Gitlab::Metrics).to receive(:counter) counter = spy('pipeline created counter')
.with(:pipelines_created_total, "Counter of pipelines created")
.and_call_original allow(Gitlab::Ci::Pipeline::Metrics)
allow(Gitlab::Metrics).to receive(:counter).and_call_original # allow other counters .to receive(:pipelines_created_counter).and_return(counter)
pipeline pipeline
expect(counter).to have_received(:increment)
end end
it 'records pipeline size in a prometheus histogram' do it 'records pipeline size in a prometheus histogram' do
histogram = spy('pipeline size histogram') histogram = spy('pipeline size histogram')
allow(Gitlab::Ci::Pipeline::Metrics) allow(Gitlab::Ci::Pipeline::Metrics)
.to receive(:new).and_return(histogram) .to receive(:pipeline_size_histogram).and_return(histogram)
execute_service execute_service
...@@ -580,6 +582,13 @@ RSpec.describe Ci::CreatePipelineService do ...@@ -580,6 +582,13 @@ RSpec.describe Ci::CreatePipelineService do
it_behaves_like 'a failed pipeline' it_behaves_like 'a failed pipeline'
it 'increments the error metric' do
stub_ci_pipeline_yaml_file(ci_yaml)
counter = Gitlab::Metrics.counter(:gitlab_ci_pipeline_failure_reasons, 'desc')
expect { execute_service }.to change { counter.get(reason: 'config_error') }.by(1)
end
context 'when receive git commit' do context 'when receive git commit' do
before do before do
allow_any_instance_of(Ci::Pipeline).to receive(:git_commit_message) { message } allow_any_instance_of(Ci::Pipeline).to receive(:git_commit_message) { message }
......
...@@ -10,6 +10,14 @@ RSpec.describe Ci::ProcessPipelineService do ...@@ -10,6 +10,14 @@ RSpec.describe Ci::ProcessPipelineService do
create(:ci_empty_pipeline, ref: 'master', project: project) create(:ci_empty_pipeline, ref: 'master', project: project)
end end
let(:pipeline_processing_events_counter) { double(increment: true) }
let(:legacy_update_jobs_counter) { double(increment: true) }
let(:metrics) do
double(pipeline_processing_events_counter: pipeline_processing_events_counter,
legacy_update_jobs_counter: legacy_update_jobs_counter)
end
subject { described_class.new(pipeline) } subject { described_class.new(pipeline) }
before do before do
...@@ -17,22 +25,13 @@ RSpec.describe Ci::ProcessPipelineService do ...@@ -17,22 +25,13 @@ RSpec.describe Ci::ProcessPipelineService do
stub_not_protect_default_branch stub_not_protect_default_branch
project.add_developer(user) project.add_developer(user)
allow(subject).to receive(:metrics).and_return(metrics)
end end
describe 'processing events counter' do describe 'processing events counter' do
let(:metrics) { double('pipeline metrics') }
let(:counter) { double('events counter') }
before do
allow(subject)
.to receive(:metrics).and_return(metrics)
allow(metrics)
.to receive(:pipeline_processing_events_counter)
.and_return(counter)
end
it 'increments processing events counter' do it 'increments processing events counter' do
expect(counter).to receive(:increment) expect(pipeline_processing_events_counter).to receive(:increment)
subject.execute subject.execute
end end
...@@ -64,33 +63,22 @@ RSpec.describe Ci::ProcessPipelineService do ...@@ -64,33 +63,22 @@ RSpec.describe Ci::ProcessPipelineService do
expect(all_builds.retried).to contain_exactly(build_retried) expect(all_builds.retried).to contain_exactly(build_retried)
end end
context 'counter ci_legacy_update_jobs_as_retried_total' do it 'increments the counter' do
let(:counter) { double(increment: true) } expect(legacy_update_jobs_counter).to receive(:increment)
subject.execute
end
context 'when the previous build has already retried column true' do
before do before do
allow(Gitlab::Metrics).to receive(:counter).and_call_original build_retried.update_columns(retried: true)
allow(Gitlab::Metrics).to receive(:counter)
.with(:ci_legacy_update_jobs_as_retried_total, anything)
.and_return(counter)
end end
it 'increments the counter' do it 'does not increment the counter' do
expect(counter).to receive(:increment) expect(legacy_update_jobs_counter).not_to receive(:increment)
subject.execute subject.execute
end end
context 'when the previous build has already retried column true' do
before do
build_retried.update_columns(retried: true)
end
it 'does not increment the counter' do
expect(counter).not_to receive(:increment)
subject.execute
end
end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment