Commit c012265e authored by Grzegorz Bizon's avatar Grzegorz Bizon

Merge branch 'ci_temporary_lock' into 'master'

Reduce resource contention on picking CI builds during `jobs/request` [RUN ALL RSPEC] [RUN AS-IF-FOSS]

See merge request gitlab-org/gitlab!55202
parents 904832a6 b6c7c198
...@@ -6,6 +6,8 @@ module Ci ...@@ -6,6 +6,8 @@ module Ci
class RegisterJobService class RegisterJobService
attr_reader :runner, :metrics attr_reader :runner, :metrics
TEMPORARY_LOCK_TIMEOUT = 3.seconds
Result = Struct.new(:build, :build_json, :valid?) Result = Struct.new(:build, :build_json, :valid?)
def initialize(runner) def initialize(runner)
...@@ -31,6 +33,20 @@ module Ci ...@@ -31,6 +33,20 @@ module Ci
depth += 1 depth += 1
@metrics.increment_queue_operation(:queue_iteration) @metrics.increment_queue_operation(:queue_iteration)
# We read builds from replicas
# It is likely that some other concurrent connection is processing
# a given build at a given moment. To avoid an expensive compute
# we perform an exclusive lease on Redis to acquire a build temporarily
unless acquire_temporary_lock(build.id)
@metrics.increment_queue_operation(:build_temporary_locked)
# We failed to acquire lock
# - our queue is not complete as some resources are locked temporarily
# - we need to re-process it again to ensure that all builds are handled
valid = false
next
end
result = process_build(build, params) result = process_build(build, params)
next unless result next unless result
...@@ -170,6 +186,16 @@ module Ci ...@@ -170,6 +186,16 @@ module Ci
!failure_reason !failure_reason
end end
def acquire_temporary_lock(build_id)
return true unless Feature.enabled?(:ci_register_job_temporary_lock, runner)
key = "build/register/#{build_id}"
Gitlab::ExclusiveLease
.new(key, timeout: TEMPORARY_LOCK_TIMEOUT.to_i)
.try_obtain
end
def scheduler_failure!(build) def scheduler_failure!(build)
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'register_job_scheduler_failure') do |subject| Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'register_job_scheduler_failure') do |subject|
subject.drop!(:scheduler_failure) subject.drop!(:scheduler_failure)
......
---
title: Improve build contention
merge_request: 55202
author:
type: performance
---
name: ci_register_job_temporary_lock
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/55202
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/323180
milestone: '13.10'
type: development
group: group::memory
default_enabled: false
...@@ -20,6 +20,7 @@ module Gitlab ...@@ -20,6 +20,7 @@ module Gitlab
:build_can_pick, :build_can_pick,
:build_not_pick, :build_not_pick,
:build_not_pending, :build_not_pending,
:build_temporary_locked,
:build_conflict_lock, :build_conflict_lock,
:build_conflict_exception, :build_conflict_exception,
:build_conflict_transition, :build_conflict_transition,
......
...@@ -600,6 +600,47 @@ module Ci ...@@ -600,6 +600,47 @@ module Ci
expect(execute(specific_runner)).to eq(pending_job) expect(execute(specific_runner)).to eq(pending_job)
end end
end end
context 'when ci_register_job_temporary_lock is enabled' do
before do
stub_feature_flags(ci_register_job_temporary_lock: true)
allow(Gitlab::Ci::Queue::Metrics.queue_operations_total).to receive(:increment)
end
context 'when a build is temporarily locked' do
let(:service) { described_class.new(specific_runner) }
before do
service.send(:acquire_temporary_lock, pending_job.id)
end
it 'skips this build and marks queue as invalid' do
expect(Gitlab::Ci::Queue::Metrics.queue_operations_total).to receive(:increment)
.with(operation: :queue_iteration)
expect(Gitlab::Ci::Queue::Metrics.queue_operations_total).to receive(:increment)
.with(operation: :build_temporary_locked)
expect(service.execute).not_to be_valid
end
context 'when there is another build in queue' do
let!(:next_pending_job) { create(:ci_build, pipeline: pipeline) }
it 'skips this build and picks another build' do
expect(Gitlab::Ci::Queue::Metrics.queue_operations_total).to receive(:increment)
.with(operation: :queue_iteration).twice
expect(Gitlab::Ci::Queue::Metrics.queue_operations_total).to receive(:increment)
.with(operation: :build_temporary_locked)
result = service.execute
expect(result.build).to eq(next_pending_job)
expect(result).to be_valid
end
end
end
end
end end
context 'when ci_register_job_service_one_by_one is enabled' do context 'when ci_register_job_service_one_by_one is enabled' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment