Commit ac847fba authored by Aleksei Lipniagov, committed by Bob Van Landuyt

Log Optimistic Locks with retries

Use structured logging to gather information on optimistic locks that
could not succeed without retries.
parent e8043280
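
The public entry point gains a name: keyword that identifies the call site; every caller in this commit passes a snake_case label that ends up in the log record. A minimal usage sketch, mirroring the StuckCiJobsWorker call site in the diff below (build, reason, and the retry count of 3 are taken from that hunk):

    # Retries the block up to 3 times when ActiveRecord::StaleObjectError is
    # raised, reloading `build` between attempts; if any retries occurred,
    # a structured log entry tagged 'stuck_ci_jobs_worker_drop_build' is emitted.
    Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
      b.drop(reason)
    end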
@@ -573,7 +573,7 @@ module Ci
     end

     def cancel_running(retries: nil)
-      retry_optimistic_lock(cancelable_statuses, retries) do |cancelable|
+      retry_optimistic_lock(cancelable_statuses, retries, name: 'ci_pipeline_cancel_running') do |cancelable|
         cancelable.find_each do |job|
           yield(job) if block_given?
           job.cancel
@@ -744,7 +744,7 @@ module Ci
     end

     def set_status(new_status)
-      retry_optimistic_lock(self) do
+      retry_optimistic_lock(self, name: 'ci_pipeline_set_status') do
         case new_status
         when 'created' then nil
         when 'waiting_for_resource' then request_resource
...
@@ -62,7 +62,7 @@ module Ci
     end

     def update_status_by!(pipeline)
-      retry_lock(self) do
+      retry_lock(self, name: 'ci_ref_update_status_by') do
         next unless last_finished_pipeline_id == pipeline.id

         case pipeline.status
...
@@ -84,7 +84,7 @@ module Ci
     end

     def set_status(new_status)
-      retry_optimistic_lock(self) do
+      retry_optimistic_lock(self, name: 'ci_stage_set_status') do
        case new_status
        when 'created' then nil
        when 'waiting_for_resource' then request_resource
...
@@ -242,7 +242,7 @@ class Environment < ApplicationRecord
   def cancel_deployment_jobs!
     jobs = active_deployments.with_deployable
     jobs.each do |deployment|
-      Gitlab::OptimisticLocking.retry_lock(deployment.deployable) do |deployable|
+      Gitlab::OptimisticLocking.retry_lock(deployment.deployable, name: 'environment_cancel_deployment_jobs') do |deployable|
        deployable.cancel! if deployable&.cancelable?
      end
    rescue => e
...
@@ -43,7 +43,7 @@ module Ci
     private

     def update_bridge_status!(bridge, pipeline)
-      Gitlab::OptimisticLocking.retry_lock(bridge) do |subject|
+      Gitlab::OptimisticLocking.retry_lock(bridge, name: 'create_downstream_pipeline_update_bridge_status') do |subject|
         if pipeline.created_successfully?
           # If bridge uses `strategy:depend` we leave it running
           # and update the status when the downstream pipeline completes.
...
@@ -80,7 +80,7 @@ module Ci
         return unless Ci::HasStatus::COMPLETED_STATUSES.include?(status)

         # transition status if possible
-        Gitlab::OptimisticLocking.retry_lock(processable) do |subject|
+        Gitlab::OptimisticLocking.retry_lock(processable, name: 'atomic_processing_update_processable') do |subject|
           Ci::ProcessBuildService.new(project, subject.user)
             .execute(subject, status)
...
@@ -152,7 +152,7 @@ module Ci
     end

     def scheduler_failure!(build)
-      Gitlab::OptimisticLocking.retry_lock(build, 3) do |subject|
+      Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'register_job_scheduler_failure') do |subject|
         subject.drop!(:scheduler_failure)
       end
     rescue => ex
...
@@ -19,7 +19,7 @@ module Ci
       mark_subsequent_stages_as_processable(build)
       build.pipeline.reset_ancestor_bridges!

-      Gitlab::OptimisticLocking.retry_lock(new_build, &:enqueue)
+      Gitlab::OptimisticLocking.retry_lock(new_build, name: 'retry_build', &:enqueue)

       MergeRequests::AddTodoWhenBuildFailsService
         .new(project, current_user)
@@ -68,7 +68,7 @@ module Ci
     def mark_subsequent_stages_as_processable(build)
       build.pipeline.processables.skipped.after_stage(build.stage_idx).find_each do |skipped|
-        retry_optimistic_lock(skipped) { |build| build.process(current_user) }
+        retry_optimistic_lock(skipped, name: 'ci_retry_build_mark_subsequent_stages') { |build| build.process(current_user) }
       end
     end
   end
...
@@ -23,7 +23,7 @@ module Ci
       end

       pipeline.builds.latest.skipped.find_each do |skipped|
-        retry_optimistic_lock(skipped) { |build| build.process(current_user) }
+        retry_optimistic_lock(skipped, name: 'ci_retry_pipeline') { |build| build.process(current_user) }
       end

       pipeline.reset_ancestor_bridges!
...
@@ -12,7 +12,7 @@ module Deployments
       return unless @deployment&.running?

       older_deployments.find_each do |older_deployment|
-        Gitlab::OptimisticLocking.retry_lock(older_deployment.deployable) do |deployable|
+        Gitlab::OptimisticLocking.retry_lock(older_deployment.deployable, name: 'older_deployments_drop') do |deployable|
           deployable.drop(:forward_deployment_failure)
         end
       rescue => e
...
@@ -60,7 +60,7 @@ module Terraform
     private

     def retrieve_with_lock(find_only: false)
-      create_or_find!(find_only: find_only).tap { |state| retry_optimistic_lock(state) { |state| yield state } }
+      create_or_find!(find_only: find_only).tap { |state| retry_optimistic_lock(state, name: 'terraform_remote_state_handler_retrieve') { |state| yield state } }
     end

     def create_or_find!(find_only:)
...
@@ -70,7 +70,7 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
   def drop_build(type, build, status, timeout, reason)
     Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{status}, timeout: #{timeout}, reason: #{reason})"

-    Gitlab::OptimisticLocking.retry_lock(build, 3) do |b|
+    Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
       b.drop(reason)
     end
   rescue => ex
...
---
title: Log Optimistic Locks with retries
merge_request: 55187
author:
type: other
@@ -25,7 +25,7 @@ module EE
         def perform!
           return unless limit.exceeded?

-          retry_optimistic_lock(pipeline) do
+          retry_optimistic_lock(pipeline, name: 'ci_pipeline_chain_limit_activity') do
             pipeline.drop!(:activity_limit_exceeded)
             limit.log_error!(project_id: project.id, plan: project.actual_plan_name)
           end
...
@@ -25,7 +25,7 @@ module EE
         def perform!
           return unless limit.exceeded?

-          retry_optimistic_lock(pipeline) do
+          retry_optimistic_lock(pipeline, name: 'ci_pipeline_chain_limit_job_activity') do
             pipeline.drop!(:job_activity_limit_exceeded)
             limit.log_error!(project_id: project.id, plan: project.actual_plan_name)
           end
...
@@ -10,7 +10,7 @@ module Gitlab
         def perform!
           return unless project.auto_cancel_pending_pipelines?

-          Gitlab::OptimisticLocking.retry_lock(auto_cancelable_pipelines) do |cancelables|
+          Gitlab::OptimisticLocking.retry_lock(auto_cancelable_pipelines, name: 'cancel_pending_pipelines') do |cancelables|
             cancelables.find_each do |cancelable|
               cancelable.auto_cancel_running(pipeline)
             end
...
@@ -2,22 +2,49 @@
 module Gitlab
   module OptimisticLocking
+    MAX_RETRIES = 100
+
     module_function

-    def retry_lock(subject, retries = nil, &block)
-      retries ||= 100
+    def retry_lock(subject, max_retries = MAX_RETRIES, name:, &block)
+      start_time = Gitlab::Metrics::System.monotonic_time
+      retry_attempts = 0

-      # TODO(Observability): We should be recording details of the number of retries and the duration of the total execution here
-      ActiveRecord::Base.transaction do
-        yield(subject)
-      end
-    rescue ActiveRecord::StaleObjectError
-      retries -= 1
-      raise unless retries >= 0
+      begin
+        ActiveRecord::Base.transaction do
+          yield(subject)
+        end
+      rescue ActiveRecord::StaleObjectError
+        raise unless retry_attempts < max_retries

-      subject.reset
-      retry
+        subject.reset
+
+        retry_attempts += 1
+        retry
+      ensure
+        elapsed_time = Gitlab::Metrics::System.monotonic_time - start_time
+
+        log_optimistic_lock_retries(
+          name: name,
+          retry_attempts: retry_attempts,
+          elapsed_time: elapsed_time)
+      end
     end

     alias_method :retry_optimistic_lock, :retry_lock
+
+    def log_optimistic_lock_retries(name:, retry_attempts:, elapsed_time:)
+      return unless retry_attempts > 0
+
+      retry_lock_logger.info(
+        message: "Optimistic Lock released with retries",
+        name: name,
+        retries: retry_attempts,
+        time_s: elapsed_time)
+    end
+
+    def retry_lock_logger
+      @retry_lock_logger ||= Gitlab::Services::Logger.build
+    end
   end
 end
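
When at least one retry occurred, the ensure block above emits a single record through Gitlab::Services::Logger. An illustrative log line for a set_status call that succeeded after two retries (the message, name, retries, and time_s fields come from log_optimistic_lock_retries above; the severity envelope is an assumption about the JSON logger's defaults):

    {"severity":"INFO","message":"Optimistic Lock released with retries","name":"ci_pipeline_set_status","retries":2,"time_s":0.038}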
@@ -7,35 +7,81 @@ RSpec.describe Gitlab::OptimisticLocking do
   let!(:pipeline2) { Ci::Pipeline.find(pipeline.id) }

   describe '#retry_lock' do
-    it 'does not reload object if state changes' do
-      expect(pipeline).not_to receive(:reset)
-      expect(pipeline).to receive(:succeed).and_call_original
+    let(:name) { 'optimistic_locking_spec' }

-      described_class.retry_lock(pipeline) do |subject|
-        subject.succeed
+    context 'when state changed successfully without retries' do
+      subject do
+        described_class.retry_lock(pipeline, name: name) do |lock_subject|
+          lock_subject.succeed
+        end
+      end
+
+      it 'does not reload object' do
+        expect(pipeline).not_to receive(:reset)
+        expect(pipeline).to receive(:succeed).and_call_original
+
+        subject
+      end
+
+      it 'does not create log record' do
+        expect(described_class.retry_lock_logger).not_to receive(:info)
+
+        subject
       end
     end

-    it 'retries action if exception is raised' do
-      pipeline.succeed
+    context 'when at least one retry happened, the change succeeded' do
+      subject do
+        described_class.retry_lock(pipeline2, name: 'optimistic_locking_spec') do |lock_subject|
+          lock_subject.drop
+        end
+      end

-      expect(pipeline2).to receive(:reset).and_call_original
-      expect(pipeline2).to receive(:drop).twice.and_call_original
+      it 'completes the action' do
+        pipeline.succeed

-      described_class.retry_lock(pipeline2) do |subject|
-        subject.drop
+        expect(pipeline2).to receive(:reset).and_call_original
+        expect(pipeline2).to receive(:drop).twice.and_call_original
+
+        subject
+      end
+
+      it 'creates a single log record' do
+        pipeline.succeed
+
+        expect(described_class.retry_lock_logger)
+          .to receive(:info)
+          .once
+          .with(hash_including(:time_s, name: name, retries: 1))
+
+        subject
       end
     end

-    it 'raises exception when too many retries' do
-      expect(pipeline).to receive(:drop).twice.and_call_original
+    context 'when MAX_RETRIES attempts exceeded' do
+      subject do
+        described_class.retry_lock(pipeline, max_retries, name: name) do |lock_subject|
+          lock_subject.lock_version = 100
+          lock_subject.drop
+        end
+      end

-      expect do
-        described_class.retry_lock(pipeline, 1) do |subject|
-          subject.lock_version = 100
-          subject.drop
-        end
-      end.to raise_error(ActiveRecord::StaleObjectError)
+      let(:max_retries) { 2 }
+
+      it 'raises an exception' do
+        expect(pipeline).to receive(:drop).exactly(max_retries + 1).times.and_call_original
+
+        expect { subject }.to raise_error(ActiveRecord::StaleObjectError)
+      end
+
+      it 'creates a single log record' do
+        expect(described_class.retry_lock_logger)
+          .to receive(:info)
+          .once
+          .with(hash_including(:time_s, name: name, retries: max_retries))
+
+        expect { subject }.to raise_error(ActiveRecord::StaleObjectError)
+      end
     end
   end
...