Commit d7874890 authored by Sean McGivern's avatar Sean McGivern

Merge branch 'fix-bg-migration-job-retaining-leason-too-much' into 'master'

Fix BG migration causes many unprocessed jobs due to the retry strategy with concurrency limit

See merge request gitlab-org/gitlab!68763
parents a05e18cf 1d3d0553
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
include ApplicationWorker include ApplicationWorker
MAX_LEASE_ATTEMPTS = 5
data_consistency :always data_consistency :always
sidekiq_options retry: 3 sidekiq_options retry: 3
...@@ -30,10 +32,11 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker ...@@ -30,10 +32,11 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
# lease_attempts - The number of times we will try to obtain an exclusive # lease_attempts - The number of times we will try to obtain an exclusive
# lease on the class before giving up. See MR for more discussion. # lease on the class before giving up. See MR for more discussion.
# https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45298#note_434304956 # https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45298#note_434304956
def perform(class_name, arguments = [], lease_attempts = 5) def perform(class_name, arguments = [], lease_attempts = MAX_LEASE_ATTEMPTS)
with_context(caller_id: class_name.to_s) do with_context(caller_id: class_name.to_s) do
retried = lease_attempts != MAX_LEASE_ATTEMPTS
attempts_left = lease_attempts - 1 attempts_left = lease_attempts - 1
should_perform, ttl = perform_and_ttl(class_name, attempts_left) should_perform, ttl = perform_and_ttl(class_name, attempts_left, retried)
break if should_perform.nil? break if should_perform.nil?
...@@ -50,13 +53,13 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker ...@@ -50,13 +53,13 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
end end
end end
def perform_and_ttl(class_name, attempts_left) def perform_and_ttl(class_name, attempts_left, retried)
# In test environments `perform_in` will run right away. This can then # In test environments `perform_in` will run right away. This can then
# lead to stack level errors in the above `#perform`. To work around this # lead to stack level errors in the above `#perform`. To work around this
# we'll just perform the migration right away in the test environment. # we'll just perform the migration right away in the test environment.
return [true, nil] if always_perform? return [true, nil] if always_perform?
lease = lease_for(class_name) lease = lease_for(class_name, retried)
lease_obtained = !!lease.try_obtain lease_obtained = !!lease.try_obtain
healthy_db = healthy_database? healthy_db = healthy_database?
perform = lease_obtained && healthy_db perform = lease_obtained && healthy_db
...@@ -82,13 +85,17 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker ...@@ -82,13 +85,17 @@ class BackgroundMigrationWorker # rubocop:disable Scalability/IdempotentWorker
[perform, lease.ttl] [perform, lease.ttl]
end end
def lease_for(class_name) def lease_for(class_name, retried)
Gitlab::ExclusiveLease Gitlab::ExclusiveLease
.new(lease_key_for(class_name), timeout: self.class.minimum_interval) .new(lease_key_for(class_name, retried), timeout: self.class.minimum_interval)
end end
def lease_key_for(class_name) def lease_key_for(class_name, retried)
"#{self.class.name}:#{class_name}" key = "#{self.class.name}:#{class_name}"
# We use a different exclusive lock key for retried jobs to allow them running concurrently with the scheduled jobs.
# See https://gitlab.com/gitlab-org/gitlab/-/merge_requests/68763 for more information.
key += ":retried" if retried
key
end end
def always_perform? def always_perform?
......
...@@ -14,7 +14,17 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do ...@@ -14,7 +14,17 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
describe '#perform' do describe '#perform' do
before do before do
allow(worker).to receive(:jid).and_return(1) allow(worker).to receive(:jid).and_return(1)
expect(worker).to receive(:always_perform?).and_return(false) allow(worker).to receive(:always_perform?).and_return(false)
end
it 'can run scheduled job and retried job concurrently' do
expect(Gitlab::BackgroundMigration)
.to receive(:perform)
.with('Foo', [10, 20])
.exactly(2).time
worker.perform('Foo', [10, 20])
worker.perform('Foo', [10, 20], described_class::MAX_LEASE_ATTEMPTS - 1)
end end
context 'when lease can be obtained' do context 'when lease can be obtained' do
...@@ -39,7 +49,7 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do ...@@ -39,7 +49,7 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
before do before do
expect(Gitlab::BackgroundMigration).not_to receive(:perform) expect(Gitlab::BackgroundMigration).not_to receive(:perform)
worker.lease_for('Foo').try_obtain worker.lease_for('Foo', false).try_obtain
end end
it 'reschedules the migration and decrements the lease_attempts' do it 'reschedules the migration and decrements the lease_attempts' do
...@@ -51,6 +61,10 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do ...@@ -51,6 +61,10 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
end end
context 'when lease_attempts is 1' do context 'when lease_attempts is 1' do
before do
worker.lease_for('Foo', true).try_obtain
end
it 'reschedules the migration and decrements the lease_attempts' do it 'reschedules the migration and decrements the lease_attempts' do
expect(described_class) expect(described_class)
.to receive(:perform_in) .to receive(:perform_in)
...@@ -61,6 +75,10 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do ...@@ -61,6 +75,10 @@ RSpec.describe BackgroundMigrationWorker, :clean_gitlab_redis_shared_state do
end end
context 'when lease_attempts is 0' do context 'when lease_attempts is 0' do
before do
worker.lease_for('Foo', true).try_obtain
end
it 'gives up performing the migration' do it 'gives up performing the migration' do
expect(described_class).not_to receive(:perform_in) expect(described_class).not_to receive(:perform_in)
expect(Sidekiq.logger).to receive(:warn).with( expect(Sidekiq.logger).to receive(:warn).with(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment