Commit 8b714a5f authored by Patrick Bair's avatar Patrick Bair

Merge branch '351014-split-the-job-and-retry' into 'master'

Resolve "Split the job and retry"

See merge request gitlab-org/gitlab!81911
parents 92b4bfe2 23af42eb
......@@ -3,6 +3,8 @@
module Gitlab
module Database
module BackgroundMigration
SplitAndRetryError = Class.new(StandardError)
class BatchedJob < SharedModel
include EachBatch
include FromUnion
......@@ -11,6 +13,8 @@ module Gitlab
MAX_ATTEMPTS = 3
STUCK_JOBS_TIMEOUT = 1.hour.freeze
TIMEOUT_EXCEPTIONS = [ActiveRecord::StatementTimeout, ActiveRecord::ConnectionTimeoutError,
ActiveRecord::AdapterTimeout, ActiveRecord::LockWaitTimeout].freeze
belongs_to :batched_migration, foreign_key: :batched_background_migration_id
has_many :batched_job_transition_logs, foreign_key: :batched_background_migration_job_id
......@@ -51,6 +55,16 @@ module Gitlab
job.metrics = {}
end
after_transition any => :failed do |job, transition|
error_hash = transition.args.find { |arg| arg[:error].present? }
exception = error_hash&.fetch(:error)
job.split_and_retry! if job.can_split?(exception)
rescue SplitAndRetryError => error
Gitlab::AppLogger.error(message: error.message, batched_job_id: job.id)
end
after_transition do |job, transition|
error_hash = transition.args.find { |arg| arg[:error].present? }
......@@ -79,13 +93,17 @@ module Gitlab
duration.to_f / batched_migration.interval
end
def can_split?(exception)
attempts >= MAX_ATTEMPTS && TIMEOUT_EXCEPTIONS.include?(exception&.class) && batch_size > sub_batch_size
end
def split_and_retry!
with_lock do
raise 'Only failed jobs can be split' unless failed?
raise SplitAndRetryError, 'Only failed jobs can be split' unless failed?
new_batch_size = batch_size / 2
raise 'Job cannot be split further' if new_batch_size < 1
raise SplitAndRetryError, 'Job cannot be split further' if new_batch_size < 1
batching_strategy = batched_migration.batch_class.new(connection: self.class.connection)
next_batch_bounds = batching_strategy.next_batch(
......
......@@ -7,6 +7,8 @@ RSpec.describe Gitlab::Database::BackgroundMigration::BatchedJob, type: :model d
it { is_expected.to be_a Gitlab::Database::SharedModel }
it { expect(described_class::TIMEOUT_EXCEPTIONS).to match_array [ActiveRecord::StatementTimeout, ActiveRecord::ConnectionTimeoutError, ActiveRecord::AdapterTimeout, ActiveRecord::LockWaitTimeout] }
describe 'associations' do
it { is_expected.to belong_to(:batched_migration).with_foreign_key(:batched_background_migration_id) }
it { is_expected.to have_many(:batched_job_transition_logs).with_foreign_key(:batched_background_migration_job_id) }
......@@ -15,6 +17,8 @@ RSpec.describe Gitlab::Database::BackgroundMigration::BatchedJob, type: :model d
describe 'state machine' do
let_it_be(:job) { create(:batched_background_migration_job, :failed) }
it { expect(described_class.state_machine.states.map(&:name)).to eql(%i(pending running failed succeeded)) }
context 'when a job is running' do
it 'logs the transition' do
expect(Gitlab::AppLogger).to receive(:info).with( { batched_job_id: job.id, message: 'BatchedJob transition', new_state: :running, previous_state: :failed } )
......@@ -47,6 +51,51 @@ RSpec.describe Gitlab::Database::BackgroundMigration::BatchedJob, type: :model d
end
end
context 'when a job fails the number of max times' do
let(:max_times) { described_class::MAX_ATTEMPTS }
let!(:job) { create(:batched_background_migration_job, :running, batch_size: 10, min_value: 6, max_value: 15, attempts: max_times) }
context 'when job can be split' do
let(:exception) { ActiveRecord::StatementTimeout.new('Timeout!') }
before do
allow_next_instance_of(Gitlab::BackgroundMigration::BatchingStrategies::PrimaryKeyBatchingStrategy) do |batch_class|
allow(batch_class).to receive(:next_batch).and_return([6, 10])
end
end
it 'splits the job into two retriable jobs' do
expect { job.failure!(error: exception) }.to change { job.batched_migration.batched_jobs.retriable.count }.from(0).to(2)
end
end
context 'when the job cannot be split' do
let(:exception) { ActiveRecord::StatementTimeout.new('Timeout!') }
let(:max_times) { described_class::MAX_ATTEMPTS }
let!(:job) { create(:batched_background_migration_job, :running, batch_size: 50, sub_batch_size: 20, min_value: 6, max_value: 15, attempts: max_times) }
let(:error_message) { 'Job cannot be split further' }
let(:split_and_retry_exception) { Gitlab::Database::BackgroundMigration::SplitAndRetryError.new(error_message) }
before do
allow(job).to receive(:split_and_retry!).and_raise(split_and_retry_exception)
end
it 'does not split the job' do
expect { job.failure!(error: exception) }.not_to change { job.batched_migration.batched_jobs.retriable.count }
end
it 'keeps the same job attributes' do
expect { job.failure!(error: exception) }.not_to change { job }
end
it 'logs the error' do
expect(Gitlab::AppLogger).to receive(:error).with( { message: error_message, batched_job_id: job.id } )
job.failure!(error: exception)
end
end
end
context 'when a job fails' do
let(:job) { create(:batched_background_migration_job, :running) }
......@@ -147,6 +196,49 @@ RSpec.describe Gitlab::Database::BackgroundMigration::BatchedJob, type: :model d
end
end
describe '#can_split?' do
subject { job.can_split?(exception) }
context 'when the number of attempts is greater than the limit and the batch_size is greater than the sub_batch_size' do
let(:job) { create(:batched_background_migration_job, :failed, batch_size: 4, sub_batch_size: 2, attempts: described_class::MAX_ATTEMPTS + 1) }
context 'when is a timeout exception' do
let(:exception) { ActiveRecord::StatementTimeout.new }
it { expect(subject).to be_truthy }
end
context 'when is not a timeout exception' do
let(:exception) { RuntimeError.new }
it { expect(subject).to be_falsey }
end
end
context 'when the number of attempts is lower than the limit and the batch_size is greater than the sub_batch_size' do
let(:job) { create(:batched_background_migration_job, :failed, batch_size: 4, sub_batch_size: 2, attempts: described_class::MAX_ATTEMPTS - 1) }
context 'when is a timeout exception' do
let(:exception) { ActiveRecord::StatementTimeout.new }
it { expect(subject).to be_falsey }
end
context 'when is not a timeout exception' do
let(:exception) { RuntimeError.new }
it { expect(subject).to be_falsey }
end
end
context 'when the batch_size is lower than the sub_batch_size' do
let(:job) { create(:batched_background_migration_job, :failed, batch_size: 2, sub_batch_size: 4) }
let(:exception) { ActiveRecord::StatementTimeout.new }
it { expect(subject).to be_falsey }
end
end
describe '#time_efficiency' do
subject { job.time_efficiency }
......
......@@ -193,6 +193,7 @@ RSpec.describe Gitlab::Database::BackgroundMigration::BatchedMigrationWrapper, '
it_behaves_like 'an error is raised', RuntimeError.new('Something broke!')
it_behaves_like 'an error is raised', SignalException.new('SIGTERM')
it_behaves_like 'an error is raised', ActiveRecord::StatementTimeout.new('Timeout!')
end
context 'when the batched background migration does not inherit from BaseJob' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment