Commit e9478858 authored by Valery Sizov

Geo: Add progressive backoff for file download retries [ci skip]

parent 3ec71075
class Geo::FileRegistry < Geo::BaseRegistry class Geo::FileRegistry < Geo::BaseRegistry
scope :failed, -> { where(success: false) } scope :failed, -> { where(success: false) }
scope :synced, -> { where(success: true) } scope :synced, -> { where(success: true) }
scope :to_be_retried, -> { where('retry_at < ?', Time.now) }
end end
...@@ -6,6 +6,7 @@ module Geo ...@@ -6,6 +6,7 @@ module Geo
class BaseSyncService class BaseSyncService
include ExclusiveLeaseGuard include ExclusiveLeaseGuard
include ::Gitlab::Geo::ProjectLogHelpers include ::Gitlab::Geo::ProjectLogHelpers
include Delay
class << self class << self
attr_accessor :type attr_accessor :type
...@@ -77,11 +78,6 @@ module Geo ...@@ -77,11 +78,6 @@ module Geo
(RETRY_BEFORE_REDOWNLOAD..RETRY_LIMIT) === retry_count (RETRY_BEFORE_REDOWNLOAD..RETRY_LIMIT) === retry_count
end end
# Progressive backoff: the wait grows with the fourth power of the retry
# count, plus a fixed 15 and a random jitter that scales with the count.
#
# @param retry_count [Integer] retry counter to derive the wait from
# @return [Integer] the computed delay (the caller applies `.seconds` to it)
def delay(retry_count = 0)
  growth = retry_count**4
  jitter = rand(30) * (retry_count + 1)

  growth + 15 + jitter
end
def sync_repository def sync_repository
raise NotImplementedError, 'This class should implement sync_repository method' raise NotImplementedError, 'This class should implement sync_repository method'
end end
...@@ -138,7 +134,7 @@ module Geo ...@@ -138,7 +134,7 @@ module Geo
if started_at if started_at
attrs["last_#{type}_synced_at"] = started_at attrs["last_#{type}_synced_at"] = started_at
attrs["#{type}_retry_count"] = retry_count + 1 attrs["#{type}_retry_count"] = retry_count + 1
attrs["#{type}_retry_at"] = Time.now + delay(retry_count).seconds attrs["#{type}_retry_at"] = Time.now + delay(attrs["#{type}_retry_count"]).seconds
end end
if finished_at if finished_at
......
...@@ -2,6 +2,8 @@ module Geo ...@@ -2,6 +2,8 @@ module Geo
class FileDownloadService < FileService class FileDownloadService < FileService
LEASE_TIMEOUT = 8.hours.freeze LEASE_TIMEOUT = 8.hours.freeze
include Delay
def execute def execute
try_obtain_lease do |lease| try_obtain_lease do |lease|
start_time = Time.now start_time = Time.now
...@@ -45,6 +47,13 @@ module Geo ...@@ -45,6 +47,13 @@ module Geo
transfer.bytes = bytes_downloaded transfer.bytes = bytes_downloaded
transfer.success = success transfer.success = success
unless success
# We don't limit the number of retries
transfer.retry_count = (transfer.retry_count || 0) + 1
transfer.retry_at = Time.now + delay(transfer.retry_count).seconds
end
transfer.save transfer.save
end end
......
...@@ -29,7 +29,7 @@ module Geo ...@@ -29,7 +29,7 @@ module Geo
if with_backup if with_backup
log_info('Removing backup copy as the repository was redownloaded successfully') log_info('Removing backup copy as the repository was redownloaded successfully')
FileUtils.rm_r(backup_path) FileUtils.rm_rf(backup_path)
end end
update_registry(finished_at: DateTime.now) update_registry(finished_at: DateTime.now)
...@@ -43,11 +43,11 @@ module Geo ...@@ -43,11 +43,11 @@ module Geo
rescue Gitlab::Git::Repository::NoRepository => e rescue Gitlab::Git::Repository::NoRepository => e
log_error('Invalid repository', e) log_error('Invalid repository', e)
registry.update(force_to_redownload_repository: true) registry.update(force_to_redownload_repository: true)
log_info('Expiring caches') expire_repository_caches
project.repository.after_create
ensure ensure
# Backup can only exist if redownload was unsuccessful # Backup can only exist if redownload was unsuccessful
if with_backup && File.exist?(backup_path) if with_backup && File.exist?(backup_path)
FileUtils.rm_rf(actual_path)
FileUtils.mv(backup_path, actual_path) FileUtils.mv(backup_path, actual_path)
end end
end end
......
...@@ -49,11 +49,6 @@ module Geo ...@@ -49,11 +49,6 @@ module Geo
end end
end end
# Placeholder: currently just delegates to fetch_wiki_repository and returns
# whatever that call returns; no backup handling is performed here.
def fetch_wiki_repository_with_backup
# TODO: replace with actual implementation
fetch_wiki_repository
end
def ssh_url_to_wiki def ssh_url_to_wiki
"#{primary_ssh_path_prefix}#{project.full_path}.wiki.git" "#{primary_ssh_path_prefix}#{project.full_path}.wiki.git"
end end
......
# Mixin that provides a retry delay calculation for services that reschedule
# failed work.
module Delay
  # Progressive backoff, copied unchanged from Sidekiq's retry formula.
  #
  # The result is retry_count to the fourth power, plus a fixed 15, plus a
  # random jitter of rand(30) scaled by (retry_count + 1).
  #
  # @param retry_count [Integer] retry counter to derive the wait from
  # @return [Integer] the computed delay (callers apply `.seconds` to it)
  def delay(retry_count = 0)
    exponential_part = retry_count**4
    random_part = rand(30) * (retry_count + 1)

    exponential_part + 15 + random_part
  end
end
...@@ -33,6 +33,7 @@ module Geo ...@@ -33,6 +33,7 @@ module Geo
def find_failed_objects(batch_size:) def find_failed_objects(batch_size:)
Geo::FileRegistry Geo::FileRegistry
.failed .failed
.to_be_retried
.limit(batch_size) .limit(batch_size)
.pluck(:file_id, :file_type) .pluck(:file_id, :file_type)
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment