Commit ac847fba authored by Aleksei Lipniagov, committed by Bob Van Landuyt

Log Optimistic Locks with retries

Use structured logging to gather information on optimistic locks that
could not succeed without retries.
parent e8043280
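
The public entry point gains a name: keyword that identifies the call site; every caller in this commit passes a snake_case label that ends up in the log record. A minimal usage sketch, mirroring the StuckCiJobsWorker call site in the diff below (build, reason, and the retry count of 3 are taken from that hunk):

    # Retries the block up to 3 times when ActiveRecord::StaleObjectError is
    # raised, reloading `build` between attempts; if any retries occurred,
    # a structured log entry tagged 'stuck_ci_jobs_worker_drop_build' is emitted.
    Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
      b.drop(reason)
    end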
@@ -573,7 +573,7 @@ module Ci
     end

     def cancel_running(retries: nil)
-      retry_optimistic_lock(cancelable_statuses, retries) do |cancelable|
+      retry_optimistic_lock(cancelable_statuses, retries, name: 'ci_pipeline_cancel_running') do |cancelable|
         cancelable.find_each do |job|
           yield(job) if block_given?
           job.cancel
@@ -744,7 +744,7 @@ module Ci
     end

     def set_status(new_status)
-      retry_optimistic_lock(self) do
+      retry_optimistic_lock(self, name: 'ci_pipeline_set_status') do
         case new_status
         when 'created' then nil
         when 'waiting_for_resource' then request_resource
...
@@ -62,7 +62,7 @@ module Ci
     end

     def update_status_by!(pipeline)
-      retry_lock(self) do
+      retry_lock(self, name: 'ci_ref_update_status_by') do
         next unless last_finished_pipeline_id == pipeline.id

         case pipeline.status
...
@@ -84,7 +84,7 @@ module Ci
     end

     def set_status(new_status)
-      retry_optimistic_lock(self) do
+      retry_optimistic_lock(self, name: 'ci_stage_set_status') do
        case new_status
        when 'created' then nil
        when 'waiting_for_resource' then request_resource
...
@@ -242,7 +242,7 @@ class Environment < ApplicationRecord
   def cancel_deployment_jobs!
     jobs = active_deployments.with_deployable
     jobs.each do |deployment|
-      Gitlab::OptimisticLocking.retry_lock(deployment.deployable) do |deployable|
+      Gitlab::OptimisticLocking.retry_lock(deployment.deployable, name: 'environment_cancel_deployment_jobs') do |deployable|
        deployable.cancel! if deployable&.cancelable?
      end
    rescue => e
...
@@ -43,7 +43,7 @@ module Ci
     private

     def update_bridge_status!(bridge, pipeline)
-      Gitlab::OptimisticLocking.retry_lock(bridge) do |subject|
+      Gitlab::OptimisticLocking.retry_lock(bridge, name: 'create_downstream_pipeline_update_bridge_status') do |subject|
         if pipeline.created_successfully?
           # If bridge uses `strategy:depend` we leave it running
           # and update the status when the downstream pipeline completes.
...
@@ -80,7 +80,7 @@ module Ci
         return unless Ci::HasStatus::COMPLETED_STATUSES.include?(status)

         # transition status if possible
-        Gitlab::OptimisticLocking.retry_lock(processable) do |subject|
+        Gitlab::OptimisticLocking.retry_lock(processable, name: 'atomic_processing_update_processable') do |subject|
           Ci::ProcessBuildService.new(project, subject.user)
             .execute(subject, status)
...
@@ -152,7 +152,7 @@ module Ci
     end

     def scheduler_failure!(build)
-      Gitlab::OptimisticLocking.retry_lock(build, 3) do |subject|
+      Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'register_job_scheduler_failure') do |subject|
         subject.drop!(:scheduler_failure)
       end
     rescue => ex
...
@@ -19,7 +19,7 @@ module Ci
       mark_subsequent_stages_as_processable(build)
       build.pipeline.reset_ancestor_bridges!

-      Gitlab::OptimisticLocking.retry_lock(new_build, &:enqueue)
+      Gitlab::OptimisticLocking.retry_lock(new_build, name: 'retry_build', &:enqueue)

       MergeRequests::AddTodoWhenBuildFailsService
         .new(project, current_user)
@@ -68,7 +68,7 @@ module Ci
     def mark_subsequent_stages_as_processable(build)
       build.pipeline.processables.skipped.after_stage(build.stage_idx).find_each do |skipped|
-        retry_optimistic_lock(skipped) { |build| build.process(current_user) }
+        retry_optimistic_lock(skipped, name: 'ci_retry_build_mark_subsequent_stages') { |build| build.process(current_user) }
       end
     end
   end
...
@@ -23,7 +23,7 @@ module Ci
       end

       pipeline.builds.latest.skipped.find_each do |skipped|
-        retry_optimistic_lock(skipped) { |build| build.process(current_user) }
+        retry_optimistic_lock(skipped, name: 'ci_retry_pipeline') { |build| build.process(current_user) }
       end

       pipeline.reset_ancestor_bridges!
...
@@ -12,7 +12,7 @@ module Deployments
       return unless @deployment&.running?

       older_deployments.find_each do |older_deployment|
-        Gitlab::OptimisticLocking.retry_lock(older_deployment.deployable) do |deployable|
+        Gitlab::OptimisticLocking.retry_lock(older_deployment.deployable, name: 'older_deployments_drop') do |deployable|
           deployable.drop(:forward_deployment_failure)
         end
       rescue => e
...
@@ -60,7 +60,7 @@ module Terraform
     private

     def retrieve_with_lock(find_only: false)
-      create_or_find!(find_only: find_only).tap { |state| retry_optimistic_lock(state) { |state| yield state } }
+      create_or_find!(find_only: find_only).tap { |state| retry_optimistic_lock(state, name: 'terraform_remote_state_handler_retrieve') { |state| yield state } }
     end

     def create_or_find!(find_only:)
...
@@ -70,7 +70,7 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
   def drop_build(type, build, status, timeout, reason)
     Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{status}, timeout: #{timeout}, reason: #{reason})"

-    Gitlab::OptimisticLocking.retry_lock(build, 3) do |b|
+    Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
       b.drop(reason)
     end
   rescue => ex
...
---
title: Log Optimistic Locks with retries
merge_request: 55187
author:
type: other
@@ -25,7 +25,7 @@ module EE
         def perform!
           return unless limit.exceeded?

-          retry_optimistic_lock(pipeline) do
+          retry_optimistic_lock(pipeline, name: 'ci_pipeline_chain_limit_activity') do
             pipeline.drop!(:activity_limit_exceeded)
             limit.log_error!(project_id: project.id, plan: project.actual_plan_name)
           end
...
@@ -25,7 +25,7 @@ module EE
         def perform!
           return unless limit.exceeded?

-          retry_optimistic_lock(pipeline) do
+          retry_optimistic_lock(pipeline, name: 'ci_pipeline_chain_limit_job_activity') do
             pipeline.drop!(:job_activity_limit_exceeded)
             limit.log_error!(project_id: project.id, plan: project.actual_plan_name)
           end
...
@@ -10,7 +10,7 @@ module Gitlab
         def perform!
           return unless project.auto_cancel_pending_pipelines?

-          Gitlab::OptimisticLocking.retry_lock(auto_cancelable_pipelines) do |cancelables|
+          Gitlab::OptimisticLocking.retry_lock(auto_cancelable_pipelines, name: 'cancel_pending_pipelines') do |cancelables|
             cancelables.find_each do |cancelable|
               cancelable.auto_cancel_running(pipeline)
             end
...
@@ -2,22 +2,49 @@
 module Gitlab
   module OptimisticLocking
+    MAX_RETRIES = 100
+
     module_function

-    def retry_lock(subject, retries = nil, &block)
-      retries ||= 100
+    def retry_lock(subject, max_retries = MAX_RETRIES, name:, &block)
+      start_time = Gitlab::Metrics::System.monotonic_time
+      retry_attempts = 0

-      # TODO(Observability): We should be recording details of the number of retries and the duration of the total execution here
-      ActiveRecord::Base.transaction do
-        yield(subject)
-      end
-    rescue ActiveRecord::StaleObjectError
-      retries -= 1
-      raise unless retries >= 0
+      begin
+        ActiveRecord::Base.transaction do
+          yield(subject)
+        end
+      rescue ActiveRecord::StaleObjectError
+        raise unless retry_attempts < max_retries

-      subject.reset
-      retry
+        subject.reset
+
+        retry_attempts += 1
+        retry
+      ensure
+        elapsed_time = Gitlab::Metrics::System.monotonic_time - start_time
+
+        log_optimistic_lock_retries(
+          name: name,
+          retry_attempts: retry_attempts,
+          elapsed_time: elapsed_time)
+      end
     end

     alias_method :retry_optimistic_lock, :retry_lock
+
+    def log_optimistic_lock_retries(name:, retry_attempts:, elapsed_time:)
+      return unless retry_attempts > 0
+
+      retry_lock_logger.info(
+        message: "Optimistic Lock released with retries",
+        name: name,
+        retries: retry_attempts,
+        time_s: elapsed_time)
+    end
+
+    def retry_lock_logger
+      @retry_lock_logger ||= Gitlab::Services::Logger.build
+    end
   end
 end
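
When at least one retry occurred, the ensure block above emits a single record through Gitlab::Services::Logger. An illustrative log line for a set_status call that succeeded after two retries (the message, name, retries, and time_s fields come from log_optimistic_lock_retries above; the severity envelope is an assumption about the JSON logger's defaults):

    {"severity":"INFO","message":"Optimistic Lock released with retries","name":"ci_pipeline_set_status","retries":2,"time_s":0.038}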
@@ -7,35 +7,81 @@ RSpec.describe Gitlab::OptimisticLocking do
   let!(:pipeline2) { Ci::Pipeline.find(pipeline.id) }

   describe '#retry_lock' do
-    it 'does not reload object if state changes' do
-      expect(pipeline).not_to receive(:reset)
-      expect(pipeline).to receive(:succeed).and_call_original
+    let(:name) { 'optimistic_locking_spec' }

-      described_class.retry_lock(pipeline) do |subject|
-        subject.succeed
+    context 'when state changed successfully without retries' do
+      subject do
+        described_class.retry_lock(pipeline, name: name) do |lock_subject|
+          lock_subject.succeed
+        end
+      end
+
+      it 'does not reload object' do
+        expect(pipeline).not_to receive(:reset)
+        expect(pipeline).to receive(:succeed).and_call_original
+
+        subject
+      end
+
+      it 'does not create log record' do
+        expect(described_class.retry_lock_logger).not_to receive(:info)
+
+        subject
       end
     end

-    it 'retries action if exception is raised' do
-      pipeline.succeed
+    context 'when at least one retry happened, the change succeeded' do
+      subject do
+        described_class.retry_lock(pipeline2, name: 'optimistic_locking_spec') do |lock_subject|
+          lock_subject.drop
+        end
+      end

-      expect(pipeline2).to receive(:reset).and_call_original
-      expect(pipeline2).to receive(:drop).twice.and_call_original
+      it 'completes the action' do
+        pipeline.succeed

-      described_class.retry_lock(pipeline2) do |subject|
-        subject.drop
+        expect(pipeline2).to receive(:reset).and_call_original
+        expect(pipeline2).to receive(:drop).twice.and_call_original
+
+        subject
+      end
+
+      it 'creates a single log record' do
+        pipeline.succeed
+
+        expect(described_class.retry_lock_logger)
+          .to receive(:info)
+          .once
+          .with(hash_including(:time_s, name: name, retries: 1))
+
+        subject
       end
     end

-    it 'raises exception when too many retries' do
-      expect(pipeline).to receive(:drop).twice.and_call_original
+    context 'when MAX_RETRIES attempts exceeded' do
+      subject do
+        described_class.retry_lock(pipeline, max_retries, name: name) do |lock_subject|
+          lock_subject.lock_version = 100
+          lock_subject.drop
+        end
+      end

-      expect do
-        described_class.retry_lock(pipeline, 1) do |subject|
-          subject.lock_version = 100
-          subject.drop
-        end
-      end.to raise_error(ActiveRecord::StaleObjectError)
+      let(:max_retries) { 2 }
+
+      it 'raises an exception' do
+        expect(pipeline).to receive(:drop).exactly(max_retries + 1).times.and_call_original
+
+        expect { subject }.to raise_error(ActiveRecord::StaleObjectError)
+      end
+
+      it 'creates a single log record' do
+        expect(described_class.retry_lock_logger)
+          .to receive(:info)
+          .once
+          .with(hash_including(:time_s, name: name, retries: max_retries))
+
+        expect { subject }.to raise_error(ActiveRecord::StaleObjectError)
+      end
     end
   end
...