Commit ac847fba authored by Aleksei Lipniagov, committed by Bob Van Landuyt

Log Optimistic Locks with retries

Use structured logging to gather information on optimistic locks that
could not succeed without retries.
parent e8043280
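
For orientation before the diff hunks: a minimal sketch (not part of this commit) of how a call site uses the new `name:` keyword, and the kind of structured entry the logger would emit when at least one retry was needed. The pipeline/`drop` call and the field values are illustrative only.

# A call site passes a `name:` that identifies it in the structured log.
Gitlab::OptimisticLocking.retry_lock(pipeline, name: 'ci_pipeline_set_status') do |subject|
  subject.drop
end

# If at least one retry was needed, the logger built by
# Gitlab::Services::Logger receives an entry along these lines
# (values illustrative):
# {"message":"Optimistic Lock released with retries",
#  "name":"ci_pipeline_set_status","retries":1,"time_s":0.042}
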
......@@ -573,7 +573,7 @@ module Ci
end
def cancel_running(retries: nil)
retry_optimistic_lock(cancelable_statuses, retries) do |cancelable|
retry_optimistic_lock(cancelable_statuses, retries, name: 'ci_pipeline_cancel_running') do |cancelable|
cancelable.find_each do |job|
yield(job) if block_given?
job.cancel
......@@ -744,7 +744,7 @@ module Ci
end
def set_status(new_status)
retry_optimistic_lock(self) do
retry_optimistic_lock(self, name: 'ci_pipeline_set_status') do
case new_status
when 'created' then nil
when 'waiting_for_resource' then request_resource
......
......@@ -62,7 +62,7 @@ module Ci
end
def update_status_by!(pipeline)
retry_lock(self) do
retry_lock(self, name: 'ci_ref_update_status_by') do
next unless last_finished_pipeline_id == pipeline.id
case pipeline.status
......
......@@ -84,7 +84,7 @@ module Ci
end
def set_status(new_status)
retry_optimistic_lock(self) do
retry_optimistic_lock(self, name: 'ci_stage_set_status') do
case new_status
when 'created' then nil
when 'waiting_for_resource' then request_resource
......
......@@ -242,7 +242,7 @@ class Environment < ApplicationRecord
def cancel_deployment_jobs!
jobs = active_deployments.with_deployable
jobs.each do |deployment|
Gitlab::OptimisticLocking.retry_lock(deployment.deployable) do |deployable|
Gitlab::OptimisticLocking.retry_lock(deployment.deployable, name: 'environment_cancel_deployment_jobs') do |deployable|
deployable.cancel! if deployable&.cancelable?
end
rescue => e
......
......@@ -43,7 +43,7 @@ module Ci
private
def update_bridge_status!(bridge, pipeline)
Gitlab::OptimisticLocking.retry_lock(bridge) do |subject|
Gitlab::OptimisticLocking.retry_lock(bridge, name: 'create_downstream_pipeline_update_bridge_status') do |subject|
if pipeline.created_successfully?
# If bridge uses `strategy:depend` we leave it running
# and update the status when the downstream pipeline completes.
......
......@@ -80,7 +80,7 @@ module Ci
return unless Ci::HasStatus::COMPLETED_STATUSES.include?(status)
# transition status if possible
Gitlab::OptimisticLocking.retry_lock(processable) do |subject|
Gitlab::OptimisticLocking.retry_lock(processable, name: 'atomic_processing_update_processable') do |subject|
Ci::ProcessBuildService.new(project, subject.user)
.execute(subject, status)
......
......@@ -152,7 +152,7 @@ module Ci
end
def scheduler_failure!(build)
Gitlab::OptimisticLocking.retry_lock(build, 3) do |subject|
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'register_job_scheduler_failure') do |subject|
subject.drop!(:scheduler_failure)
end
rescue => ex
......
......@@ -19,7 +19,7 @@ module Ci
mark_subsequent_stages_as_processable(build)
build.pipeline.reset_ancestor_bridges!
Gitlab::OptimisticLocking.retry_lock(new_build, &:enqueue)
Gitlab::OptimisticLocking.retry_lock(new_build, name: 'retry_build', &:enqueue)
MergeRequests::AddTodoWhenBuildFailsService
.new(project, current_user)
......@@ -68,7 +68,7 @@ module Ci
def mark_subsequent_stages_as_processable(build)
build.pipeline.processables.skipped.after_stage(build.stage_idx).find_each do |skipped|
retry_optimistic_lock(skipped) { |build| build.process(current_user) }
retry_optimistic_lock(skipped, name: 'ci_retry_build_mark_subsequent_stages') { |build| build.process(current_user) }
end
end
end
......
......@@ -23,7 +23,7 @@ module Ci
end
pipeline.builds.latest.skipped.find_each do |skipped|
retry_optimistic_lock(skipped) { |build| build.process(current_user) }
retry_optimistic_lock(skipped, name: 'ci_retry_pipeline') { |build| build.process(current_user) }
end
pipeline.reset_ancestor_bridges!
......
......@@ -12,7 +12,7 @@ module Deployments
return unless @deployment&.running?
older_deployments.find_each do |older_deployment|
Gitlab::OptimisticLocking.retry_lock(older_deployment.deployable) do |deployable|
Gitlab::OptimisticLocking.retry_lock(older_deployment.deployable, name: 'older_deployments_drop') do |deployable|
deployable.drop(:forward_deployment_failure)
end
rescue => e
......
......@@ -60,7 +60,7 @@ module Terraform
private
def retrieve_with_lock(find_only: false)
create_or_find!(find_only: find_only).tap { |state| retry_optimistic_lock(state) { |state| yield state } }
create_or_find!(find_only: find_only).tap { |state| retry_optimistic_lock(state, name: 'terraform_remote_state_handler_retrieve') { |state| yield state } }
end
def create_or_find!(find_only:)
......
......@@ -70,7 +70,7 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
def drop_build(type, build, status, timeout, reason)
Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{status}, timeout: #{timeout}, reason: #{reason})"
Gitlab::OptimisticLocking.retry_lock(build, 3) do |b|
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
b.drop(reason)
end
rescue => ex
......
---
title: Log Optimistic Locks with retries
merge_request: 55187
author:
type: other
......@@ -25,7 +25,7 @@ module EE
def perform!
return unless limit.exceeded?
retry_optimistic_lock(pipeline) do
retry_optimistic_lock(pipeline, name: 'ci_pipeline_chain_limit_activity') do
pipeline.drop!(:activity_limit_exceeded)
limit.log_error!(project_id: project.id, plan: project.actual_plan_name)
end
......
......@@ -25,7 +25,7 @@ module EE
def perform!
return unless limit.exceeded?
retry_optimistic_lock(pipeline) do
retry_optimistic_lock(pipeline, name: 'ci_pipeline_chain_limit_job_activity') do
pipeline.drop!(:job_activity_limit_exceeded)
limit.log_error!(project_id: project.id, plan: project.actual_plan_name)
end
......
......@@ -10,7 +10,7 @@ module Gitlab
def perform!
return unless project.auto_cancel_pending_pipelines?
Gitlab::OptimisticLocking.retry_lock(auto_cancelable_pipelines) do |cancelables|
Gitlab::OptimisticLocking.retry_lock(auto_cancelable_pipelines, name: 'cancel_pending_pipelines') do |cancelables|
cancelables.find_each do |cancelable|
cancelable.auto_cancel_running(pipeline)
end
......
......@@ -2,22 +2,49 @@
module Gitlab
module OptimisticLocking
MAX_RETRIES = 100
module_function
def retry_lock(subject, retries = nil, &block)
retries ||= 100
# TODO(Observability): We should be recording details of the number of retries and the duration of the total execution here
def retry_lock(subject, max_retries = MAX_RETRIES, name:, &block)
start_time = Gitlab::Metrics::System.monotonic_time
retry_attempts = 0
begin
ActiveRecord::Base.transaction do
yield(subject)
end
rescue ActiveRecord::StaleObjectError
retries -= 1
raise unless retries >= 0
raise unless retry_attempts < max_retries
subject.reset
retry_attempts += 1
retry
ensure
elapsed_time = Gitlab::Metrics::System.monotonic_time - start_time
log_optimistic_lock_retries(
name: name,
retry_attempts: retry_attempts,
elapsed_time: elapsed_time)
end
end
alias_method :retry_optimistic_lock, :retry_lock
def log_optimistic_lock_retries(name:, retry_attempts:, elapsed_time:)
return unless retry_attempts > 0
retry_lock_logger.info(
message: "Optimistic Lock released with retries",
name: name,
retries: retry_attempts,
time_s: elapsed_time)
end
def retry_lock_logger
@retry_lock_logger ||= Gitlab::Services::Logger.build
end
end
end
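
Because the hunk above interleaves the removed and added lines, the following sketch reassembles lib/gitlab/optimistic_locking.rb as it reads after the change; it only restates the added lines from the diff, with blank-line placement and comments assumed.

module Gitlab
  module OptimisticLocking
    MAX_RETRIES = 100

    module_function

    def retry_lock(subject, max_retries = MAX_RETRIES, name:, &block)
      start_time = Gitlab::Metrics::System.monotonic_time
      retry_attempts = 0

      begin
        ActiveRecord::Base.transaction do
          yield(subject)
        end
      rescue ActiveRecord::StaleObjectError
        # Give up once the retry budget is exhausted; otherwise reload the
        # record and run the block again against the fresh lock_version.
        raise unless retry_attempts < max_retries

        subject.reset
        retry_attempts += 1
        retry
      ensure
        elapsed_time = Gitlab::Metrics::System.monotonic_time - start_time

        log_optimistic_lock_retries(
          name: name,
          retry_attempts: retry_attempts,
          elapsed_time: elapsed_time)
      end
    end

    alias_method :retry_optimistic_lock, :retry_lock

    def log_optimistic_lock_retries(name:, retry_attempts:, elapsed_time:)
      # Only lock acquisitions that needed at least one retry are logged.
      return unless retry_attempts > 0

      retry_lock_logger.info(
        message: "Optimistic Lock released with retries",
        name: name,
        retries: retry_attempts,
        time_s: elapsed_time)
    end

    def retry_lock_logger
      @retry_lock_logger ||= Gitlab::Services::Logger.build
    end
  end
end
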
......@@ -7,35 +7,81 @@ RSpec.describe Gitlab::OptimisticLocking do
let!(:pipeline2) { Ci::Pipeline.find(pipeline.id) }
describe '#retry_lock' do
it 'does not reload object if state changes' do
let(:name) { 'optimistic_locking_spec' }
context 'when state changed successfully without retries' do
subject do
described_class.retry_lock(pipeline, name: name) do |lock_subject|
lock_subject.succeed
end
end
it 'does not reload object' do
expect(pipeline).not_to receive(:reset)
expect(pipeline).to receive(:succeed).and_call_original
described_class.retry_lock(pipeline) do |subject|
subject.succeed
subject
end
it 'does not create log record' do
expect(described_class.retry_lock_logger).not_to receive(:info)
subject
end
end
it 'retries action if exception is raised' do
context 'when at least one retry happened, the change succeeded' do
subject do
described_class.retry_lock(pipeline2, name: 'optimistic_locking_spec') do |lock_subject|
lock_subject.drop
end
end
it 'completes the action' do
pipeline.succeed
expect(pipeline2).to receive(:reset).and_call_original
expect(pipeline2).to receive(:drop).twice.and_call_original
described_class.retry_lock(pipeline2) do |subject|
subject.drop
subject
end
it 'creates a single log record' do
pipeline.succeed
expect(described_class.retry_lock_logger)
.to receive(:info)
.once
.with(hash_including(:time_s, name: name, retries: 1))
subject
end
end
context 'when MAX_RETRIES attempts exceeded' do
subject do
described_class.retry_lock(pipeline, max_retries, name: name) do |lock_subject|
lock_subject.lock_version = 100
lock_subject.drop
end
end
let(:max_retries) { 2 }
it 'raises an exception' do
expect(pipeline).to receive(:drop).exactly(max_retries + 1).times.and_call_original
expect { subject }.to raise_error(ActiveRecord::StaleObjectError)
end
it 'raises exception when too many retries' do
expect(pipeline).to receive(:drop).twice.and_call_original
it 'creates a single log record' do
expect(described_class.retry_lock_logger)
.to receive(:info)
.once
.with(hash_including(:time_s, name: name, retries: max_retries))
expect do
described_class.retry_lock(pipeline, 1) do |subject|
subject.lock_version = 100
subject.drop
expect { subject }.to raise_error(ActiveRecord::StaleObjectError)
end
end.to raise_error(ActiveRecord::StaleObjectError)
end
end
......