Commit 73a78f1f authored by Mike Kozono's avatar Mike Kozono

Increase retry backoff cap for missing on primary

- Affects blob data types replicated by the Geo Self-Service Framework.
- Also fixes failure message truncation
parent 6fa714ee
......@@ -56,6 +56,8 @@ module Geo::ReplicableRegistry
included do
include ::Delay
attr_accessor :custom_max_retry_wait_time
scope :failed, -> { with_state(:failed) }
scope :needs_sync_again, -> { failed.retry_due.order(Gitlab::Database.nulls_first_order(:retry_at)) }
scope :never_attempted_sync, -> { pending.where(last_synced_at: nil) }
......@@ -82,7 +84,7 @@ module Geo::ReplicableRegistry
before_transition any => :failed do |registry, _|
registry.retry_count += 1
registry.retry_at = registry.next_retry_time(registry.retry_count)
registry.retry_at = registry.next_retry_time(registry.retry_count, registry.custom_max_retry_wait_time)
end
before_transition any => :synced do |registry, _|
......@@ -117,10 +119,12 @@ module Geo::ReplicableRegistry
#
# @param [String] message error information
# @param [StandardError] error exception
def failed!(message, error = nil)
# @param [Boolean] missing_on_primary if the resource is missing on the primary
def failed!(message:, error: nil, missing_on_primary: nil)
self.last_sync_failure = message
self.last_sync_failure += ": #{error.message}" if error.respond_to?(:message)
self.last_sync_failure.truncate(255)
self.last_sync_failure = self.last_sync_failure.truncate(255)
self.custom_max_retry_wait_time = missing_on_primary ? 4.hours : nil
super()
end
......
......@@ -43,7 +43,7 @@ module Geo
message = download_result.reason
error = download_result.extra_details&.fetch(:error, nil)
registry.failed!(message, error)
registry.failed!(message: message, error: error, missing_on_primary: download_result.primary_missing_file)
end
log_download(mark_as_synced, download_result, start_time)
......
......@@ -167,7 +167,7 @@ module Geo
registry = replicator.registry
registry.force_to_redownload = force_to_redownload
registry.failed!(message, error)
registry.failed!(message: message, error: error)
repository.clean_stale_repository_files
end
......
......@@ -60,6 +60,16 @@ RSpec.describe Geo::BlobDownloadService do
expect(registry_class.last).to be_failed
end
it 'caps retry wait time to 1 hour' do
registry = replicator.registry
registry.retry_count = 9999
registry.save!
subject.execute
expect(registry.reload.retry_at).to be_within(10.minutes).of(1.hour.from_now)
end
end
context "when the file is missing on the primary" do
......@@ -77,6 +87,16 @@ RSpec.describe Geo::BlobDownloadService do
expect(registry_class.last).to be_failed
end
it 'caps retry wait time to 4 hours' do
registry = replicator.registry
registry.retry_count = 9999
registry.save!
subject.execute
expect(registry.reload.retry_at).to be_within(10.minutes).of(4.hours.from_now)
end
end
context "when the feature flag geo_treat_missing_files_as_sync_failed is disabled" do
......@@ -97,6 +117,12 @@ RSpec.describe Geo::BlobDownloadService do
expect(registry_class.last).to be_synced
end
it 'does not set retry_at because it is not a failure' do
subject.execute
expect(registry_class.last.retry_at).to be_nil
end
end
end
end
......
......@@ -76,4 +76,75 @@ RSpec.shared_examples 'a Geo framework registry' do
expect(record2.reload.state).to eq described_class::STATE_VALUES[:started]
end
end
describe '#failed!' do
let(:registry) { create(registry_class_factory, :started) }
let(:message) { 'Foo' }
it 'sets last_sync_failure with message' do
registry.failed!(message: message)
expect(registry.last_sync_failure).to include(message)
end
it 'truncates a long last_sync_failure' do
registry.failed!(message: 'a' * 256)
expect(registry.last_sync_failure).to eq('a' * 252 + '...')
end
it 'increments retry_count' do
registry.failed!(message: message)
expect(registry.retry_count).to eq(1)
registry.start
registry.failed!(message: message)
expect(registry.retry_count).to eq(2)
end
it 'sets retry_at to a time in the future' do
now = Time.current
registry.failed!(message: message)
expect(registry.retry_at >= now).to be_truthy
end
context 'when an error is given' do
it 'includes error.message in last_sync_failure' do
registry.failed!(message: message, error: StandardError.new('bar'))
expect(registry.last_sync_failure).to eq('Foo: bar')
end
end
context 'when missing_on_primary is not given' do
it 'caps retry_at to default 1 hour' do
registry.retry_count = 9999
registry.failed!(message: message)
expect(registry.retry_at).to be_within(10.minutes).of(1.hour.from_now)
end
end
context 'when missing_on_primary is falsey' do
it 'caps retry_at to default 1 hour' do
registry.retry_count = 9999
registry.failed!(message: message, missing_on_primary: false)
expect(registry.retry_at).to be_within(10.minutes).of(1.hour.from_now)
end
end
context 'when missing_on_primary is truthy' do
it 'caps retry_at to 4 hours' do
registry.retry_count = 9999
registry.failed!(message: message, missing_on_primary: true)
expect(registry.retry_at).to be_within(10.minutes).of(4.hours.from_now)
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment