Commit ac847fba authored by Aleksei Lipniagov, committed by Bob Van Landuyt

Log Optimistic Locks with retries

Use structured logging to gather information on optimistic locks that
could not succeed without retries.
parent e8043280
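
For orientation before the diff hunks: a minimal sketch (not part of this commit) of how a call site uses the new `name:` keyword, and the kind of structured entry the logger would emit when at least one retry was needed. The pipeline/`drop` call and the field values are illustrative only.

# A call site passes a `name:` that identifies it in the structured log.
Gitlab::OptimisticLocking.retry_lock(pipeline, name: 'ci_pipeline_set_status') do |subject|
  subject.drop
end

# If at least one retry was needed, the logger built by
# Gitlab::Services::Logger receives an entry along these lines
# (values illustrative):
# {"message":"Optimistic Lock released with retries",
#  "name":"ci_pipeline_set_status","retries":1,"time_s":0.042}
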
......@@ -573,7 +573,7 @@ module Ci
end
def cancel_running(retries: nil)
retry_optimistic_lock(cancelable_statuses, retries) do |cancelable|
retry_optimistic_lock(cancelable_statuses, retries, name: 'ci_pipeline_cancel_running') do |cancelable|
cancelable.find_each do |job|
yield(job) if block_given?
job.cancel
......@@ -744,7 +744,7 @@ module Ci
end
def set_status(new_status)
retry_optimistic_lock(self) do
retry_optimistic_lock(self, name: 'ci_pipeline_set_status') do
case new_status
when 'created' then nil
when 'waiting_for_resource' then request_resource
......
......@@ -62,7 +62,7 @@ module Ci
end
def update_status_by!(pipeline)
retry_lock(self) do
retry_lock(self, name: 'ci_ref_update_status_by') do
next unless last_finished_pipeline_id == pipeline.id
case pipeline.status
......
......@@ -84,7 +84,7 @@ module Ci
end
def set_status(new_status)
retry_optimistic_lock(self) do
retry_optimistic_lock(self, name: 'ci_stage_set_status') do
case new_status
when 'created' then nil
when 'waiting_for_resource' then request_resource
......
......@@ -242,7 +242,7 @@ class Environment < ApplicationRecord
def cancel_deployment_jobs!
jobs = active_deployments.with_deployable
jobs.each do |deployment|
Gitlab::OptimisticLocking.retry_lock(deployment.deployable) do |deployable|
Gitlab::OptimisticLocking.retry_lock(deployment.deployable, name: 'environment_cancel_deployment_jobs') do |deployable|
deployable.cancel! if deployable&.cancelable?
end
rescue => e
......
......@@ -43,7 +43,7 @@ module Ci
private
def update_bridge_status!(bridge, pipeline)
Gitlab::OptimisticLocking.retry_lock(bridge) do |subject|
Gitlab::OptimisticLocking.retry_lock(bridge, name: 'create_downstream_pipeline_update_bridge_status') do |subject|
if pipeline.created_successfully?
# If bridge uses `strategy:depend` we leave it running
# and update the status when the downstream pipeline completes.
......
......@@ -80,7 +80,7 @@ module Ci
return unless Ci::HasStatus::COMPLETED_STATUSES.include?(status)
# transition status if possible
Gitlab::OptimisticLocking.retry_lock(processable) do |subject|
Gitlab::OptimisticLocking.retry_lock(processable, name: 'atomic_processing_update_processable') do |subject|
Ci::ProcessBuildService.new(project, subject.user)
.execute(subject, status)
......
......@@ -152,7 +152,7 @@ module Ci
end
def scheduler_failure!(build)
Gitlab::OptimisticLocking.retry_lock(build, 3) do |subject|
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'register_job_scheduler_failure') do |subject|
subject.drop!(:scheduler_failure)
end
rescue => ex
......
......@@ -19,7 +19,7 @@ module Ci
mark_subsequent_stages_as_processable(build)
build.pipeline.reset_ancestor_bridges!
Gitlab::OptimisticLocking.retry_lock(new_build, &:enqueue)
Gitlab::OptimisticLocking.retry_lock(new_build, name: 'retry_build', &:enqueue)
MergeRequests::AddTodoWhenBuildFailsService
.new(project, current_user)
......@@ -68,7 +68,7 @@ module Ci
def mark_subsequent_stages_as_processable(build)
build.pipeline.processables.skipped.after_stage(build.stage_idx).find_each do |skipped|
retry_optimistic_lock(skipped) { |build| build.process(current_user) }
retry_optimistic_lock(skipped, name: 'ci_retry_build_mark_subsequent_stages') { |build| build.process(current_user) }
end
end
end
......
......@@ -23,7 +23,7 @@ module Ci
end
pipeline.builds.latest.skipped.find_each do |skipped|
retry_optimistic_lock(skipped) { |build| build.process(current_user) }
retry_optimistic_lock(skipped, name: 'ci_retry_pipeline') { |build| build.process(current_user) }
end
pipeline.reset_ancestor_bridges!
......
......@@ -12,7 +12,7 @@ module Deployments
return unless @deployment&.running?
older_deployments.find_each do |older_deployment|
Gitlab::OptimisticLocking.retry_lock(older_deployment.deployable) do |deployable|
Gitlab::OptimisticLocking.retry_lock(older_deployment.deployable, name: 'older_deployments_drop') do |deployable|
deployable.drop(:forward_deployment_failure)
end
rescue => e
......
......@@ -60,7 +60,7 @@ module Terraform
private
def retrieve_with_lock(find_only: false)
create_or_find!(find_only: find_only).tap { |state| retry_optimistic_lock(state) { |state| yield state } }
create_or_find!(find_only: find_only).tap { |state| retry_optimistic_lock(state, name: 'terraform_remote_state_handler_retrieve') { |state| yield state } }
end
def create_or_find!(find_only:)
......
......@@ -70,7 +70,7 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
def drop_build(type, build, status, timeout, reason)
Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{status}, timeout: #{timeout}, reason: #{reason})"
Gitlab::OptimisticLocking.retry_lock(build, 3) do |b|
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
b.drop(reason)
end
rescue => ex
......
---
title: Log Optimistic Locks with retries
merge_request: 55187
author:
type: other
......@@ -25,7 +25,7 @@ module EE
def perform!
return unless limit.exceeded?
retry_optimistic_lock(pipeline) do
retry_optimistic_lock(pipeline, name: 'ci_pipeline_chain_limit_activity') do
pipeline.drop!(:activity_limit_exceeded)
limit.log_error!(project_id: project.id, plan: project.actual_plan_name)
end
......
......@@ -25,7 +25,7 @@ module EE
def perform!
return unless limit.exceeded?
retry_optimistic_lock(pipeline) do
retry_optimistic_lock(pipeline, name: 'ci_pipeline_chain_limit_job_activity') do
pipeline.drop!(:job_activity_limit_exceeded)
limit.log_error!(project_id: project.id, plan: project.actual_plan_name)
end
......
......@@ -10,7 +10,7 @@ module Gitlab
def perform!
return unless project.auto_cancel_pending_pipelines?
Gitlab::OptimisticLocking.retry_lock(auto_cancelable_pipelines) do |cancelables|
Gitlab::OptimisticLocking.retry_lock(auto_cancelable_pipelines, name: 'cancel_pending_pipelines') do |cancelables|
cancelables.find_each do |cancelable|
cancelable.auto_cancel_running(pipeline)
end
......
......@@ -2,22 +2,49 @@
module Gitlab
module OptimisticLocking
MAX_RETRIES = 100
module_function
def retry_lock(subject, retries = nil, &block)
retries ||= 100
# TODO(Observability): We should be recording details of the number of retries and the duration of the total execution here
def retry_lock(subject, max_retries = MAX_RETRIES, name:, &block)
start_time = Gitlab::Metrics::System.monotonic_time
retry_attempts = 0
begin
ActiveRecord::Base.transaction do
yield(subject)
end
rescue ActiveRecord::StaleObjectError
retries -= 1
raise unless retries >= 0
raise unless retry_attempts < max_retries
subject.reset
retry_attempts += 1
retry
ensure
elapsed_time = Gitlab::Metrics::System.monotonic_time - start_time
log_optimistic_lock_retries(
name: name,
retry_attempts: retry_attempts,
elapsed_time: elapsed_time)
end
end
alias_method :retry_optimistic_lock, :retry_lock
def log_optimistic_lock_retries(name:, retry_attempts:, elapsed_time:)
return unless retry_attempts > 0
retry_lock_logger.info(
message: "Optimistic Lock released with retries",
name: name,
retries: retry_attempts,
time_s: elapsed_time)
end
def retry_lock_logger
@retry_lock_logger ||= Gitlab::Services::Logger.build
end
end
end
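
Because the hunk above interleaves the removed and added lines, the following sketch reassembles lib/gitlab/optimistic_locking.rb as it reads after the change; it only restates the added lines from the diff, with blank-line placement and comments assumed.

module Gitlab
  module OptimisticLocking
    MAX_RETRIES = 100

    module_function

    def retry_lock(subject, max_retries = MAX_RETRIES, name:, &block)
      start_time = Gitlab::Metrics::System.monotonic_time
      retry_attempts = 0

      begin
        ActiveRecord::Base.transaction do
          yield(subject)
        end
      rescue ActiveRecord::StaleObjectError
        # Give up once the retry budget is exhausted; otherwise reload the
        # record and run the block again against the fresh lock_version.
        raise unless retry_attempts < max_retries

        subject.reset
        retry_attempts += 1
        retry
      ensure
        elapsed_time = Gitlab::Metrics::System.monotonic_time - start_time

        log_optimistic_lock_retries(
          name: name,
          retry_attempts: retry_attempts,
          elapsed_time: elapsed_time)
      end
    end

    alias_method :retry_optimistic_lock, :retry_lock

    def log_optimistic_lock_retries(name:, retry_attempts:, elapsed_time:)
      # Only lock acquisitions that needed at least one retry are logged.
      return unless retry_attempts > 0

      retry_lock_logger.info(
        message: "Optimistic Lock released with retries",
        name: name,
        retries: retry_attempts,
        time_s: elapsed_time)
    end

    def retry_lock_logger
      @retry_lock_logger ||= Gitlab::Services::Logger.build
    end
  end
end
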
......@@ -7,35 +7,81 @@ RSpec.describe Gitlab::OptimisticLocking do
let!(:pipeline2) { Ci::Pipeline.find(pipeline.id) }
describe '#retry_lock' do
it 'does not reload object if state changes' do
let(:name) { 'optimistic_locking_spec' }
context 'when state changed successfully without retries' do
subject do
described_class.retry_lock(pipeline, name: name) do |lock_subject|
lock_subject.succeed
end
end
it 'does not reload object' do
expect(pipeline).not_to receive(:reset)
expect(pipeline).to receive(:succeed).and_call_original
described_class.retry_lock(pipeline) do |subject|
subject.succeed
subject
end
it 'does not create log record' do
expect(described_class.retry_lock_logger).not_to receive(:info)
subject
end
end
it 'retries action if exception is raised' do
context 'when at least one retry happened, the change succeeded' do
subject do
described_class.retry_lock(pipeline2, name: 'optimistic_locking_spec') do |lock_subject|
lock_subject.drop
end
end
it 'completes the action' do
pipeline.succeed
expect(pipeline2).to receive(:reset).and_call_original
expect(pipeline2).to receive(:drop).twice.and_call_original
described_class.retry_lock(pipeline2) do |subject|
subject.drop
subject
end
it 'creates a single log record' do
pipeline.succeed
expect(described_class.retry_lock_logger)
.to receive(:info)
.once
.with(hash_including(:time_s, name: name, retries: 1))
subject
end
end
context 'when MAX_RETRIES attempts exceeded' do
subject do
described_class.retry_lock(pipeline, max_retries, name: name) do |lock_subject|
lock_subject.lock_version = 100
lock_subject.drop
end
end
let(:max_retries) { 2 }
it 'raises an exception' do
expect(pipeline).to receive(:drop).exactly(max_retries + 1).times.and_call_original
expect { subject }.to raise_error(ActiveRecord::StaleObjectError)
end
it 'raises exception when too many retries' do
expect(pipeline).to receive(:drop).twice.and_call_original
it 'creates a single log record' do
expect(described_class.retry_lock_logger)
.to receive(:info)
.once
.with(hash_including(:time_s, name: name, retries: max_retries))
expect do
described_class.retry_lock(pipeline, 1) do |subject|
subject.lock_version = 100
subject.drop
expect { subject }.to raise_error(ActiveRecord::StaleObjectError)
end
end.to raise_error(ActiveRecord::StaleObjectError)
end
end
......