Commit e7739e5c authored by Grzegorz Bizon's avatar Grzegorz Bizon

Add build status update exponential backoff mechanism

This commit adds a class that will be used to calculate an exponential
backoff in seconds that we will pass to a runner to avoid overloading
the API in case of object storage slowness.
parent 8787a3c4
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
module Ci module Ci
class UpdateBuildStateService class UpdateBuildStateService
Result = Struct.new(:status, keyword_init: true) Result = Struct.new(:status, :backoff, keyword_init: true)
ACCEPT_TIMEOUT = 5.minutes.freeze ACCEPT_TIMEOUT = 5.minutes.freeze
...@@ -28,7 +28,9 @@ module Ci ...@@ -28,7 +28,9 @@ module Ci
private private
def accept_build_state! def accept_build_state!
if Time.current - ensure_pending_state.created_at > ACCEPT_TIMEOUT state_created = ensure_pending_state.created_at
if Time.current - state_created > ACCEPT_TIMEOUT
metrics.increment_trace_operation(operation: :discarded) metrics.increment_trace_operation(operation: :discarded)
return update_build_state! return update_build_state!
...@@ -40,7 +42,9 @@ module Ci ...@@ -40,7 +42,9 @@ module Ci
metrics.increment_trace_operation(operation: :accepted) metrics.increment_trace_operation(operation: :accepted)
Result.new(status: 202) ::Gitlab::Ci::Runner::Backoff.new(state_created).then do |backoff|
Result.new(status: 202, backoff: backoff.to_seconds)
end
end end
def overwrite_trace! def overwrite_trace!
......
...@@ -181,6 +181,7 @@ module API ...@@ -181,6 +181,7 @@ module API
.new(job, declared_params(include_missing: false)) .new(job, declared_params(include_missing: false))
service.execute.then do |result| service.execute.then do |result|
header 'X-GitLab-Trace-Update-Interval', result.backoff
status result.status status result.status
end end
end end
......
# frozen_string_literal: true
module Gitlab
module Ci
module Runner
##
# Runner Backoff class is an implementation of an exponential backoff
# used when a runner communicates with GitLab. We typically use it when a
# runner retries sending a build status after we created a build pending
# state.
#
# Backoff is calculated based on the backoff slot which is always a power
# of 2:
#
# 0s - 3s duration -> 1 second backoff
# 4s - 7s duration -> 2 seconds backoff
# 8s - 15s duration -> 4 seconds backoff
# 16s - 31s duration -> 8 seconds backoff
# 32s - 63s duration -> 16 seconds backoff
# 64s - 127s duration -> 32 seconds backoff
# 127s - 256s+ duration -> 64 seconds backoff
#
# It means that first 15 requests made by a runner will need to respect
# following backoffs:
#
# 0s -> 1 second backoff (backoff started, slot 0, 2^0 backoff)
# 1s -> 1 second backoff
# 2s -> 1 second backoff
# 3s -> 1 seconds backoff
# (slot 1 - 2^1 backoff)
# 4s -> 2 seconds backoff
# 6s -> 2 seconds backoff
# (slot 2 - 2^2 backoff)
# 8s -> 4 seconds backoff
# 12s -> 4 seconds backoff
# (slot 3 - 2^3 backoff)
# 16s -> 8 seconds backoff
# 24s -> 8 seconds backoff
# (slot 4 - 2^4 backoff)
# 32s -> 16 seconds backoff
# 48s -> 16 seconds backoff
# (slot 5 - 2^5 backoff)
# 64s -> 32 seconds backoff
# 96s -> 32 seconds backoff
# (slot 6 - 2^6 backoff)
# 128s -> 64 seconds backoff
#
# There is a cap on the backoff - it will never exceed 64 seconds.
#
class Backoff
def initialize(started)
@started = started
if duration < 0
raise ArgumentError, 'backoff duration negative'
end
end
def duration
Time.current - @started
end
def slot
return 0 if duration <= 1
Math.log(duration, 2).floor - 1
end
def to_seconds
2**[slot, 6].min
end
end
end
end
end
# frozen_string_literal: true
require 'fast_spec_helper'
require 'rspec-parameterized'
require 'active_support/testing/time_helpers'
RSpec.describe Gitlab::Ci::Runner::Backoff do
include ActiveSupport::Testing::TimeHelpers
describe '#duration' do
it 'returns backoff duration from start' do
freeze_time do
described_class.new(5.minutes.ago).then do |backoff|
expect(backoff.duration).to eq 5.minutes
end
end
end
end
describe '#slot' do
using RSpec::Parameterized::TableSyntax
where(:started, :slot) do
1 | 0
2 | 0
3 | 0
4 | 1
5 | 1
6 | 1
7 | 1
8 | 2
9 | 2
10 | 2
15 | 2
16 | 3
31 | 3
32 | 4
63 | 4
64 | 5
127 | 5
128 | 6
250 | 6
310 | 7
520 | 8
999 | 8
end
with_them do
it 'falls into an appropaite backoff slot' do
freeze_time do
backoff = described_class.new(started.seconds.ago)
expect(backoff.slot).to eq slot
end
end
end
end
describe '#to_seconds' do
using RSpec::Parameterized::TableSyntax
where(:started, :backoff) do
1 | 1
2 | 1
3 | 1
4 | 2
5 | 2
6 | 2
7 | 2
8 | 4
9 | 4
10 | 4
15 | 4
16 | 8
31 | 8
32 | 16
63 | 16
64 | 32
127 | 32
128 | 64
250 | 64
310 | 64
520 | 64
999 | 64
end
with_them do
it 'calculates backoff based on an appropriate slot' do
freeze_time do
described_class.new(started.seconds.ago).then do |delay|
expect(delay.to_seconds).to eq backoff
end
end
end
end
end
end
...@@ -46,64 +46,59 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do ...@@ -46,64 +46,59 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
end end
context 'when status is given' do context 'when status is given' do
it 'mark job as succeeded' do it 'marks job as succeeded' do
update_job(state: 'success') update_job(state: 'success')
job.reload expect(job.reload).to be_success
expect(job).to be_success expect(response.header).not_to have_key('X-GitLab-Trace-Update-Interval')
end end
it 'mark job as failed' do it 'marks job as failed' do
update_job(state: 'failed') update_job(state: 'failed')
job.reload expect(job.reload).to be_failed
expect(job).to be_failed
expect(job).to be_unknown_failure expect(job).to be_unknown_failure
expect(response.header).not_to have_key('X-GitLab-Trace-Update-Interval')
end end
context 'when failure_reason is script_failure' do context 'when failure_reason is script_failure' do
before do before do
update_job(state: 'failed', failure_reason: 'script_failure') update_job(state: 'failed', failure_reason: 'script_failure')
job.reload
end end
it { expect(job).to be_script_failure } it { expect(job.reload).to be_script_failure }
end end
context 'when failure_reason is runner_system_failure' do context 'when failure_reason is runner_system_failure' do
before do before do
update_job(state: 'failed', failure_reason: 'runner_system_failure') update_job(state: 'failed', failure_reason: 'runner_system_failure')
job.reload
end end
it { expect(job).to be_runner_system_failure } it { expect(job.reload).to be_runner_system_failure }
end end
context 'when failure_reason is unrecognized value' do context 'when failure_reason is unrecognized value' do
before do before do
update_job(state: 'failed', failure_reason: 'what_is_this') update_job(state: 'failed', failure_reason: 'what_is_this')
job.reload
end end
it { expect(job).to be_unknown_failure } it { expect(job.reload).to be_unknown_failure }
end end
context 'when failure_reason is job_execution_timeout' do context 'when failure_reason is job_execution_timeout' do
before do before do
update_job(state: 'failed', failure_reason: 'job_execution_timeout') update_job(state: 'failed', failure_reason: 'job_execution_timeout')
job.reload
end end
it { expect(job).to be_job_execution_timeout } it { expect(job.reload).to be_job_execution_timeout }
end end
context 'when failure_reason is unmet_prerequisites' do context 'when failure_reason is unmet_prerequisites' do
before do before do
update_job(state: 'failed', failure_reason: 'unmet_prerequisites') update_job(state: 'failed', failure_reason: 'unmet_prerequisites')
job.reload
end end
it { expect(job).to be_unmet_prerequisites } it { expect(job.reload).to be_unmet_prerequisites }
end end
context 'when unmigrated live trace chunks exist' do context 'when unmigrated live trace chunks exist' do
...@@ -119,6 +114,7 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do ...@@ -119,6 +114,7 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
expect(job.pending_state).to be_present expect(job.pending_state).to be_present
expect(response).to have_gitlab_http_status(:accepted) expect(response).to have_gitlab_http_status(:accepted)
expect(response.header['X-GitLab-Trace-Update-Interval']).to be > 0
end end
end end
...@@ -151,6 +147,7 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do ...@@ -151,6 +147,7 @@ RSpec.describe API::Ci::Runner, :clean_gitlab_redis_shared_state do
expect(job.reload).to be_success expect(job.reload).to be_success
expect(job.pending_state).not_to be_present expect(job.pending_state).not_to be_present
expect(response).to have_gitlab_http_status(:ok) expect(response).to have_gitlab_http_status(:ok)
expect(response.header).not_to have_key('X-GitLab-Trace-Update-Interval')
end end
end end
end end
......
...@@ -95,6 +95,12 @@ RSpec.describe Ci::UpdateBuildStateService do ...@@ -95,6 +95,12 @@ RSpec.describe Ci::UpdateBuildStateService do
expect(result.status).to eq 200 expect(result.status).to eq 200
end end
it 'does not set a backoff value' do
result = subject.execute
expect(result.backoff).to be_nil
end
it 'increments trace finalized operation metric' do it 'increments trace finalized operation metric' do
subject.execute subject.execute
...@@ -121,6 +127,12 @@ RSpec.describe Ci::UpdateBuildStateService do ...@@ -121,6 +127,12 @@ RSpec.describe Ci::UpdateBuildStateService do
expect(result.status).to eq 202 expect(result.status).to eq 202
end end
it 'sets a request backoff value' do
result = subject.execute
expect(result.backoff.to_i).to be > 0
end
it 'schedules live chunks for migration' do it 'schedules live chunks for migration' do
expect(Ci::BuildTraceChunkFlushWorker) expect(Ci::BuildTraceChunkFlushWorker)
.to receive(:perform_async) .to receive(:perform_async)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment