Commit b33661d6 authored by Stan Hu's avatar Stan Hu

Add ExclusiveLease guards for RepositoryCheck::{DispatchWorker,BatchWorker}

We saw in production that DispatchWorker was running about twice an hour,
which would schedule twice as many jobs as it should.

For some reason, BatchWorker was running 1000 times per hour, possibly
due to Sidekiq RSS kills that caused these jobs to restart.

Adding an ExclusiveLease prevents these jobs from running more
than they should.

Relates to https://gitlab.com/gitlab-com/infrastructure/issues/4526
parent a291bcdf
......@@ -4,9 +4,11 @@ module RepositoryCheck
class BatchWorker
include ApplicationWorker
include RepositoryCheckQueue
include ExclusiveLeaseGuard
RUN_TIME = 3600
BATCH_SIZE = 10_000
LEASE_TIMEOUT = 1.hour
attr_reader :shard_name
......@@ -16,6 +18,20 @@ module RepositoryCheck
return unless Gitlab::CurrentSettings.repository_checks_enabled
return unless Gitlab::ShardHealthCache.healthy_shard?(shard_name)
try_obtain_lease do
perform_repository_checks
end
end
def lease_timeout
LEASE_TIMEOUT
end
def lease_key
"repository_check_batch_worker:#{shard_name}"
end
def perform_repository_checks
start = Time.now
# This loop will break after a little more than one hour ('a little
......@@ -26,7 +42,7 @@ module RepositoryCheck
project_ids.each do |project_id|
break if Time.now - start >= RUN_TIME
next unless try_obtain_lease(project_id)
next unless try_obtain_lease_for_project(project_id)
SingleRepositoryWorker.new.perform(project_id)
end
......@@ -60,7 +76,7 @@ module RepositoryCheck
Project.where(repository_storage: shard_name)
end
def try_obtain_lease(id)
def try_obtain_lease_for_project(id)
# Use a 24-hour timeout because on servers/projects where 'git fsck' is
# super slow we definitely do not want to run it twice in parallel.
Gitlab::ExclusiveLease.new(
......
......@@ -3,13 +3,22 @@ module RepositoryCheck
include ApplicationWorker
include CronjobQueue
include ::EachShardWorker
include ExclusiveLeaseGuard
LEASE_TIMEOUT = 1.hour
def perform
return unless Gitlab::CurrentSettings.repository_checks_enabled
try_obtain_lease do
each_eligible_shard do |shard_name|
RepositoryCheck::BatchWorker.perform_async(shard_name)
end
end
end
def lease_timeout
LEASE_TIMEOUT
end
end
end
......@@ -62,4 +62,12 @@ describe RepositoryCheck::BatchWorker do
expect(subject.perform(shard_name)).to eq([])
end
it 'does not run if the exclusive lease is taken' do
allow(subject).to receive(:try_obtain_lease).and_return(false)
expect(subject).not_to receive(:perform_repository_checks)
subject.perform(shard_name)
end
end
......@@ -11,6 +11,14 @@ describe RepositoryCheck::DispatchWorker do
subject.perform
end
it 'does nothing if the exclusive lease is taken' do
allow(subject).to receive(:try_obtain_lease).and_return(false)
expect(RepositoryCheck::BatchWorker).not_to receive(:perform_async)
subject.perform
end
it 'dispatches work to RepositoryCheck::BatchWorker' do
expect(RepositoryCheck::BatchWorker).to receive(:perform_async).at_least(:once)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment