Commit fd24f258 authored by Kamil Trzciński

Make `jobs/request` resilient

This makes the request and its underlying service
resilient to data integrity failures
and properly presents them to the user.
parent 168db043
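For orientation, a standalone illustration (not part of the commit, helper name is hypothetical) of the retry-limit resolution the reworked `retry_failure?` performs in the diff below: an explicit `retry:` option wins when its `when:` list matches the failure reason (or contains 'always'); otherwise the new `DEFAULT_RETRIES` fallback applies, which is what lets a `scheduler_failure` be retried twice even with no `retry:` configuration at all.

# Illustration only -- not GitLab code.
DEFAULT_RETRIES = { scheduler_failure: 2 }.freeze

def max_allowed_retries(options_max, reason_matches_when, failure_reason)
  # explicit retry: option applies only when its when: list matches
  max = options_max if reason_matches_when
  # otherwise fall back to the per-reason default (0 means "do not retry")
  max || DEFAULT_RETRIES.fetch(failure_reason, 0)
end

max_allowed_retries(nil, false, :scheduler_failure) # => 2 (DEFAULT_RETRIES fallback)
max_allowed_retries(1, true, :scheduler_failure)    # => 1 (explicit retry: option wins)
max_allowed_retries(2, false, :script_failure)      # => 0 (when: list does not match)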
@@ -35,6 +35,10 @@ module Ci
       refspecs: -> (build) { build.merge_request_ref? }
     }.freeze
 
+    DEFAULT_RETRIES = {
+      scheduler_failure: 2
+    }.freeze
+
     has_one :deployment, as: :deployable, class_name: 'Deployment'
     has_many :trace_sections, class_name: 'Ci::BuildTraceSection'
     has_many :trace_chunks, class_name: 'Ci::BuildTraceChunk', foreign_key: :build_id
@@ -372,18 +376,25 @@ module Ci
       pipeline.builds.retried.where(name: self.name).count
     end
 
-    def retries_max
-      normalized_retry.fetch(:max, 0)
+    def retry_failure?
+      max_allowed_retries = nil
+      max_allowed_retries ||= options_retry_max if retry_on_reason_or_always?
+      max_allowed_retries ||= DEFAULT_RETRIES.fetch(failure_reason.to_sym, 0)
+
+      max_allowed_retries > 0 && retries_count < max_allowed_retries
     end
 
-    def retry_when
-      normalized_retry.fetch(:when, ['always'])
+    def options_retry_max
+      options_retry[:max]
     end
 
-    def retry_failure?
-      return false if retries_max.zero? || retries_count >= retries_max
+    def options_retry_when
+      options_retry.fetch(:when, ['always'])
+    end
 
-      retry_when.include?('always') || retry_when.include?(failure_reason.to_s)
+    def retry_on_reason_or_always?
+      options_retry_when.include?(failure_reason.to_s) ||
+        options_retry_when.include?('always')
     end
 
     def latest?
@@ -831,6 +842,13 @@ module Ci
       :creating
     end
 
+    # Consider this object to have a structural integrity problems
+    def doom!
+      update_columns(
+        status: :failed,
+        failure_reason: :data_integrity_failure)
+    end
+
     private
 
     def successful_deployment_status
@@ -875,8 +893,8 @@ module Ci
     # format, but builds created before GitLab 11.5 and saved in database still
     # have the old integer only format. This method returns the retry option
     # normalized as a hash in 11.5+ format.
-    def normalized_retry
-      strong_memoize(:normalized_retry) do
+    def options_retry
+      strong_memoize(:options_retry) do
         value = options&.dig(:retry)
         value = value.is_a?(Integer) ? { max: value } : value.to_h
         value.with_indifferent_access
...
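For reference, a standalone sketch (not GitLab code, helper name is hypothetical) of the normalization that the renamed `options_retry` helper performs, per the comment above: integer `retry:` values written before GitLab 11.5 become the 11.5+ hash form, while hashes pass through.

# Illustration only -- not GitLab code.
def normalize_retry_option(value)
  # pre-11.5 pipelines stored a bare integer; 11.5+ stores a hash
  value.is_a?(Integer) ? { max: value } : value.to_h
end

normalize_retry_option(2)                            # => { max: 2 }
normalize_retry_option({ max: 2, when: ['always'] }) # => { max: 2, when: ["always"] }
normalize_retry_option(nil)                          # => {}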
@@ -15,7 +15,9 @@ module CommitStatusEnums
       stale_schedule: 7,
       job_execution_timeout: 8,
       archived_failure: 9,
-      unmet_prerequisites: 10
+      unmet_prerequisites: 10,
+      scheduler_failure: 11,
+      data_integrity_failure: 12
     }
   end
 end
...
@@ -11,7 +11,9 @@ class CommitStatusPresenter < Gitlab::View::Presenter::Delegated
     stale_schedule: 'Delayed job could not be executed by some reason, please try again',
     job_execution_timeout: 'The script exceeded the maximum execution time set for the job',
     archived_failure: 'The job is archived and cannot be run',
-    unmet_prerequisites: 'The job failed to complete prerequisite tasks'
+    unmet_prerequisites: 'The job failed to complete prerequisite tasks',
+    scheduler_failure: 'The scheduler failed to assign job to the runner, please try again or contact system administrator',
+    data_integrity_failure: 'There has been a structural integrity problem detected, please contact system administrator'
   }.freeze
 
   private_constant :CALLOUT_FAILURE_MESSAGES
@@ -33,6 +35,6 @@ class CommitStatusPresenter < Gitlab::View::Presenter::Delegated
   end
 
   def unrecoverable?
-    script_failure? || missing_dependency_failure? || archived_failure?
+    script_failure? || missing_dependency_failure? || archived_failure? || scheduler_failure? || data_integrity_failure?
   end
 end
@@ -42,15 +42,34 @@ module Ci
       end
 
       builds.each do |build|
-        next unless runner.can_pick?(build)
-
-        begin
-          # In case when 2 runners try to assign the same build, second runner will be declined
-          # with StateMachines::InvalidTransition or StaleObjectError when doing run! or save method.
-          if assign_runner!(build, params)
-            register_success(build)
-
-            return Result.new(build, true)
-          end
-        rescue StateMachines::InvalidTransition, ActiveRecord::StaleObjectError
-          # We are looping to find another build that is not conflicting
+        result = process_build(build, params)
+        next unless result
+
+        if result.valid?
+          register_success(result.build)
+
+          return result
+        else
+          # The usage of valid: is described in
+          # handling of ActiveRecord::StaleObjectError
+          valid = false
+        end
+      end
+
+      register_failure
+      Result.new(nil, valid)
+    end
+    # rubocop: enable CodeReuse/ActiveRecord
+
+    private
+
+    def process_build(build, params)
+      return unless runner.can_pick?(build)
+
+      # In case when 2 runners try to assign the same build, second runner will be declined
+      # with StateMachines::InvalidTransition or StaleObjectError when doing run! or save method.
+      if assign_runner!(build, params)
+        Result.new(build, true)
+      end
+    rescue StateMachines::InvalidTransition, ActiveRecord::StaleObjectError
+      # We are looping to find another build that is not conflicting
@@ -62,16 +81,16 @@ module Ci
       # In case we hit the concurrency-access lock,
       # we still have to return 409 in the end,
       # to make sure that this is properly handled by runner.
-          valid = false
-        end
-      end
-
-      register_failure
-      Result.new(nil, valid)
-    end
-    # rubocop: enable CodeReuse/ActiveRecord
-
-    private
+      Result.new(nil, false)
+    rescue => ex
+      raise ex unless Feature.enabled?(:ci_doom_build, default_enabled: true)
+
+      scheduler_failure!(build)
+      track_exception_for_build(ex, build)
+
+      # skip, and move to next one
+      nil
+    end
 
     def assign_runner!(build, params)
       build.runner_id = runner.id
@@ -96,6 +115,28 @@ module Ci
       true
     end
 
+    def scheduler_failure!(build)
+      Gitlab::OptimisticLocking.retry_lock(build, 3) do |subject|
+        subject.drop!(:scheduler_failure)
+      end
+    rescue => ex
+      build.doom!
+
+      # This requires extra exception, otherwise we would loose information
+      # why we cannot perform `scheduler_failure`
+      track_exception_for_build(ex, build)
+    end
+
+    def track_exception_for_build(ex, build)
+      Gitlab::Sentry.track_acceptable_exception(ex, extra: {
+        build_id: build.id,
+        build_name: build.name,
+        build_stage: build.stage,
+        pipeline_id: build.pipeline_id,
+        project_id: build.project_id
+      })
+    end
+
     # rubocop: disable CodeReuse/ActiveRecord
     def builds_for_shared_runner
       new_builds.
...
---
title: Make `jobs/request` resilient
merge_request: 19150
author:
type: fixed
@@ -16,7 +16,9 @@ module Gitlab
      stale_schedule: 'stale schedule',
      job_execution_timeout: 'job execution timeout',
      archived_failure: 'archived failure',
-     unmet_prerequisites: 'unmet prerequisites'
+     unmet_prerequisites: 'unmet prerequisites',
+     scheduler_failure: 'scheduler failure',
+     data_integrity_failure: 'data integrity failure'
    }.freeze
 
    private_constant :REASONS
...
@@ -1587,7 +1587,7 @@ describe Ci::Build do
     end
   end
 
-  describe '#retries_max' do
+  describe '#options_retry_max' do
     context 'with retries max config option' do
       subject { create(:ci_build, options: { retry: { max: 1 } }) }
 
@@ -1597,7 +1597,7 @@ describe Ci::Build do
       end
 
       it 'returns the number of configured max retries' do
-        expect(subject.retries_max).to eq 1
+        expect(subject.options_retry_max).to eq 1
       end
     end
 
@@ -1607,7 +1607,7 @@ describe Ci::Build do
      end
 
      it 'returns the number of configured max retries' do
-       expect(subject.retries_max).to eq 1
+       expect(subject.options_retry_max).to eq 1
      end
    end
  end
@@ -1615,16 +1615,16 @@ describe Ci::Build do
     context 'without retries max config option' do
       subject { create(:ci_build) }
 
-      it 'returns zero' do
-        expect(subject.retries_max).to eq 0
+      it 'returns nil' do
+        expect(subject.options_retry_max).to be_nil
       end
     end
 
     context 'when build is degenerated' do
       subject { create(:ci_build, :degenerated) }
 
-      it 'returns zero' do
-        expect(subject.retries_max).to eq 0
+      it 'returns nil' do
+        expect(subject.options_retry_max).to be_nil
       end
     end
 
@@ -1632,17 +1632,17 @@ describe Ci::Build do
       subject { create(:ci_build, options: { retry: 1 }) }
 
       it 'returns the number of configured max retries' do
-        expect(subject.retries_max).to eq 1
+        expect(subject.options_retry_max).to eq 1
       end
     end
   end
 
-  describe '#retry_when' do
+  describe '#options_retry_when' do
     context 'with retries when config option' do
       subject { create(:ci_build, options: { retry: { when: ['some_reason'] } }) }
 
       it 'returns the configured when' do
-        expect(subject.retry_when).to eq ['some_reason']
+        expect(subject.options_retry_when).to eq ['some_reason']
       end
     end
 
@@ -1650,7 +1650,7 @@ describe Ci::Build do
       subject { create(:ci_build) }
 
       it 'returns always array' do
-        expect(subject.retry_when).to eq ['always']
+        expect(subject.options_retry_when).to eq ['always']
       end
     end
 
@@ -1658,72 +1658,38 @@ describe Ci::Build do
       subject { create(:ci_build, options: { retry: 1 }) }
 
       it 'returns always array' do
-        expect(subject.retry_when).to eq ['always']
+        expect(subject.options_retry_when).to eq ['always']
       end
     end
   end
 
   describe '#retry_failure?' do
-    subject { create(:ci_build) }
-
-    context 'when retries max is zero' do
-      before do
-        expect(subject).to receive(:retries_max).at_least(:once).and_return(0)
-      end
-
-      it 'returns false' do
-        expect(subject.retry_failure?).to eq false
-      end
-    end
-
-    context 'when retries max equals retries count' do
-      before do
-        expect(subject).to receive(:retries_max).at_least(:once).and_return(1)
-        expect(subject).to receive(:retries_count).at_least(:once).and_return(1)
-      end
-
-      it 'returns false' do
-        expect(subject.retry_failure?).to eq false
-      end
-    end
-
-    context 'when retries max is higher than retries count' do
-      before do
-        expect(subject).to receive(:retries_max).at_least(:once).and_return(2)
-        expect(subject).to receive(:retries_count).at_least(:once).and_return(1)
-      end
-
-      context 'and retry when is always' do
-        before do
-          expect(subject).to receive(:retry_when).at_least(:once).and_return(['always'])
-        end
-
-        it 'returns true' do
-          expect(subject.retry_failure?).to eq true
-        end
-      end
-
-      context 'and retry when includes the failure_reason' do
-        before do
-          expect(subject).to receive(:failure_reason).at_least(:once).and_return('some_reason')
-          expect(subject).to receive(:retry_when).at_least(:once).and_return(['some_reason'])
-        end
-
-        it 'returns true' do
-          expect(subject.retry_failure?).to eq true
-        end
-      end
-
-      context 'and retry when does not include failure_reason' do
-        before do
-          expect(subject).to receive(:failure_reason).at_least(:once).and_return('some_reason')
-          expect(subject).to receive(:retry_when).at_least(:once).and_return(['some', 'other failure'])
-        end
-
-        it 'returns false' do
-          expect(subject.retry_failure?).to eq false
-        end
-      end
-    end
+    using RSpec::Parameterized::TableSyntax
+
+    let(:build) { create(:ci_build) }
+
+    subject { build.retry_failure? }
+
+    where(:description, :retry_count, :options, :failure_reason, :result) do
+      "retries are disabled"          | 0 | { max: 0 }                              | nil                | false
+      "max equals count"              | 2 | { max: 2 }                              | nil                | false
+      "max is higher than count"      | 1 | { max: 2 }                              | nil                | true
+      "matching failure reason"       | 0 | { when: %w[api_failure], max: 2 }       | :api_failure       | true
+      "not matching with always"      | 0 | { when: %w[always], max: 2 }            | :api_failure       | true
+      "not matching reason"           | 0 | { when: %w[script_error], max: 2 }      | :api_failure       | false
+      "scheduler failure override"    | 1 | { when: %w[scheduler_failure], max: 1 } | :scheduler_failure | false
+      "default for scheduler failure" | 1 | {}                                      | :scheduler_failure | true
+    end
+
+    with_them do
+      before do
+        allow(build).to receive(:retries_count) { retry_count }
+
+        build.options[:retry] = options
+        build.failure_reason = failure_reason
+      end
+
+      it { is_expected.to eq(result) }
+    end
   end
 end
...
@@ -269,7 +269,7 @@ describe Ci::BuildPresenter do
     let(:build) { create(:ci_build, :failed, :script_failure) }
 
     context 'when is a script or missing dependency failure' do
-      let(:failure_reasons) { %w(script_failure missing_dependency_failure archived_failure) }
+      let(:failure_reasons) { %w(script_failure missing_dependency_failure archived_failure scheduler_failure data_integrity_failure) }
 
       it 'returns false' do
         failure_reasons.each do |failure_reason|
...
@@ -773,8 +773,8 @@ describe Ci::CreatePipelineService do
       it 'correctly creates builds with auto-retry value configured' do
         expect(pipeline).to be_persisted
-        expect(rspec_job.retries_max).to eq 2
-        expect(rspec_job.retry_when).to eq ['always']
+        expect(rspec_job.options_retry_max).to eq 2
+        expect(rspec_job.options_retry_when).to eq ['always']
       end
     end
 
@@ -783,8 +783,8 @@ describe Ci::CreatePipelineService do
       it 'correctly creates builds with auto-retry value configured' do
         expect(pipeline).to be_persisted
-        expect(rspec_job.retries_max).to eq 2
-        expect(rspec_job.retry_when).to eq ['runner_system_failure']
+        expect(rspec_job.options_retry_max).to eq 2
+        expect(rspec_job.options_retry_when).to eq ['runner_system_failure']
       end
     end
   end
...
@@ -502,6 +502,57 @@ module Ci
         end
       end
 
+      context 'when build has data integrity problem' do
+        let!(:pending_job) do
+          create(:ci_build, :pending, pipeline: pipeline)
+        end
+
+        before do
+          pending_job.update_columns(options: "string")
+        end
+
+        subject { execute(specific_runner, {}) }
+
+        it 'does drop the build and logs both failures' do
+          expect(Gitlab::Sentry).to receive(:track_acceptable_exception)
+            .with(anything, a_hash_including(extra: a_hash_including(build_id: pending_job.id)))
+            .twice
+            .and_call_original
+
+          expect(subject).to be_nil
+
+          pending_job.reload
+          expect(pending_job).to be_failed
+          expect(pending_job).to be_data_integrity_failure
+        end
+      end
+
+      context 'when build fails to be run!' do
+        let!(:pending_job) do
+          create(:ci_build, :pending, pipeline: pipeline)
+        end
+
+        before do
+          expect_any_instance_of(Ci::Build).to receive(:run!)
+            .and_raise(RuntimeError, 'scheduler error')
+        end
+
+        subject { execute(specific_runner, {}) }
+
+        it 'does drop the build and logs failure' do
+          expect(Gitlab::Sentry).to receive(:track_acceptable_exception)
+            .with(anything, a_hash_including(extra: a_hash_including(build_id: pending_job.id)))
+            .once
+            .and_call_original
+
+          expect(subject).to be_nil
+
+          pending_job.reload
+          expect(pending_job).to be_failed
+          expect(pending_job).to be_scheduler_failure
+        end
+      end
+
       context 'when an exception is raised during a persistent ref creation' do
         before do
           allow_any_instance_of(Ci::PersistentRef).to receive(:exist?) { false }
...