Commit 9cde7509 authored by Patrick Bair's avatar Patrick Bair Committed by Mayra Cabrera

Add helper to create batched background migration

Add a new migration helper to simplify starting a batched background
migration, by defaulting values that don't always need to be specified.
parent 3f2c4621
...@@ -4,8 +4,12 @@ module Gitlab ...@@ -4,8 +4,12 @@ module Gitlab
module Database module Database
module Migrations module Migrations
module BackgroundMigrationHelpers module BackgroundMigrationHelpers
BACKGROUND_MIGRATION_BATCH_SIZE = 1_000 # Number of rows to process per job BATCH_SIZE = 1_000 # Number of rows to process per job
BACKGROUND_MIGRATION_JOB_BUFFER_SIZE = 1_000 # Number of jobs to bulk queue at a time SUB_BATCH_SIZE = 100 # Number of rows to process per sub-batch
JOB_BUFFER_SIZE = 1_000 # Number of jobs to bulk queue at a time
BATCH_CLASS_NAME = 'PrimaryKeyBatchingStrategy' # Default batch class for batched migrations
BATCH_MIN_VALUE = 1 # Default minimum value for batched migrations
BATCH_MIN_DELAY = 2.minutes.freeze # Minimum delay between batched migrations
# Bulk queues background migration jobs for an entire table, batched by ID range. # Bulk queues background migration jobs for an entire table, batched by ID range.
# "Bulk" meaning many jobs will be pushed at a time for efficiency. # "Bulk" meaning many jobs will be pushed at a time for efficiency.
...@@ -31,7 +35,7 @@ module Gitlab ...@@ -31,7 +35,7 @@ module Gitlab
# # do something # # do something
# end # end
# end # end
def bulk_queue_background_migration_jobs_by_range(model_class, job_class_name, batch_size: BACKGROUND_MIGRATION_BATCH_SIZE) def bulk_queue_background_migration_jobs_by_range(model_class, job_class_name, batch_size: BATCH_SIZE)
raise "#{model_class} does not have an ID to use for batch ranges" unless model_class.column_names.include?('id') raise "#{model_class} does not have an ID to use for batch ranges" unless model_class.column_names.include?('id')
jobs = [] jobs = []
...@@ -40,7 +44,7 @@ module Gitlab ...@@ -40,7 +44,7 @@ module Gitlab
model_class.each_batch(of: batch_size) do |relation| model_class.each_batch(of: batch_size) do |relation|
start_id, end_id = relation.pluck("MIN(#{table_name}.id)", "MAX(#{table_name}.id)").first start_id, end_id = relation.pluck("MIN(#{table_name}.id)", "MAX(#{table_name}.id)").first
if jobs.length >= BACKGROUND_MIGRATION_JOB_BUFFER_SIZE if jobs.length >= JOB_BUFFER_SIZE
# Note: This code path generally only helps with many millions of rows # Note: This code path generally only helps with many millions of rows
# We push multiple jobs at a time to reduce the time spent in # We push multiple jobs at a time to reduce the time spent in
# Sidekiq/Redis operations. We're using this buffer based approach so we # Sidekiq/Redis operations. We're using this buffer based approach so we
...@@ -89,7 +93,7 @@ module Gitlab ...@@ -89,7 +93,7 @@ module Gitlab
# # do something # # do something
# end # end
# end # end
def queue_background_migration_jobs_by_range_at_intervals(model_class, job_class_name, delay_interval, batch_size: BACKGROUND_MIGRATION_BATCH_SIZE, other_job_arguments: [], initial_delay: 0, track_jobs: false, primary_column_name: :id) def queue_background_migration_jobs_by_range_at_intervals(model_class, job_class_name, delay_interval, batch_size: BATCH_SIZE, other_job_arguments: [], initial_delay: 0, track_jobs: false, primary_column_name: :id)
raise "#{model_class} does not have an ID column of #{primary_column_name} to use for batch ranges" unless model_class.column_names.include?(primary_column_name.to_s) raise "#{model_class} does not have an ID column of #{primary_column_name} to use for batch ranges" unless model_class.column_names.include?(primary_column_name.to_s)
raise "#{primary_column_name} is not an integer column" unless model_class.columns_hash[primary_column_name.to_s].type == :integer raise "#{primary_column_name} is not an integer column" unless model_class.columns_hash[primary_column_name.to_s].type == :integer
...@@ -127,6 +131,79 @@ module Gitlab ...@@ -127,6 +131,79 @@ module Gitlab
final_delay final_delay
end end
# Creates a batched background migration for the given table. A batched migration runs one job
# at a time, computing the bounds of the next batch based on the current migration settings and the previous
# batch bounds. Each job's execution status is tracked in the database as the migration runs. The given job
# class must be present in the Gitlab::BackgroundMigration module, and the batch class (if specified) must be
# present in the Gitlab::BackgroundMigration::BatchingStrategies module.
#
# job_class_name - The background migration job class as a string
# batch_table_name - The name of the table the migration will batch over
# batch_column_name - The name of the column the migration will batch over
# job_arguments - Extra arguments to pass to the job instance when the migration runs
# job_interval - The pause interval between each job's execution, minimum of 2 minutes
# batch_min_value - The value in the column the batching will begin at
# batch_max_value - The value in the column the batching will end at, defaults to `SELECT MAX(batch_column)`
# batch_class_name - The name of the class that will be called to find the range of each next batch
# batch_size - The maximum number of rows per job
# sub_batch_size - The maximum number of rows processed per "iteration" within the job
#
#
# *Returns the created BatchedMigration record*
#
# Example:
#
# queue_batched_background_migration(
# 'CopyColumnUsingBackgroundMigrationJob',
# :events,
# :id,
# job_interval: 2.minutes,
# other_job_arguments: ['column1', 'column2'])
#
# Where the the background migration exists:
#
# class Gitlab::BackgroundMigration::CopyColumnUsingBackgroundMigrationJob
# def perform(start_id, end_id, batch_table, batch_column, sub_batch_size, *other_args)
# # do something
# end
# end
def queue_batched_background_migration( # rubocop:disable Metrics/ParameterLists
job_class_name,
batch_table_name,
batch_column_name,
*job_arguments,
job_interval:,
batch_min_value: BATCH_MIN_VALUE,
batch_max_value: nil,
batch_class_name: BATCH_CLASS_NAME,
batch_size: BATCH_SIZE,
sub_batch_size: SUB_BATCH_SIZE
)
job_interval = BATCH_MIN_DELAY if job_interval < BATCH_MIN_DELAY
batch_max_value ||= connection.select_value(<<~SQL)
SELECT MAX(#{connection.quote_column_name(batch_column_name)})
FROM #{connection.quote_table_name(batch_table_name)}
SQL
migration_status = batch_max_value.nil? ? :finished : :active
batch_max_value ||= batch_min_value
Gitlab::Database::BackgroundMigration::BatchedMigration.create!(
job_class_name: job_class_name,
table_name: batch_table_name,
column_name: batch_column_name,
interval: job_interval,
min_value: batch_min_value,
max_value: batch_max_value,
batch_class_name: batch_class_name,
batch_size: batch_size,
sub_batch_size: sub_batch_size,
job_arguments: job_arguments,
status: migration_status)
end
def perform_background_migration_inline? def perform_background_migration_inline?
Rails.env.test? || Rails.env.development? Rails.env.test? || Rails.env.development?
end end
......
...@@ -21,7 +21,7 @@ RSpec.describe Gitlab::Database::Migrations::BackgroundMigrationHelpers do ...@@ -21,7 +21,7 @@ RSpec.describe Gitlab::Database::Migrations::BackgroundMigrationHelpers do
context 'with enough rows to bulk queue jobs more than once' do context 'with enough rows to bulk queue jobs more than once' do
before do before do
stub_const('Gitlab::Database::Migrations::BackgroundMigrationHelpers::BACKGROUND_MIGRATION_JOB_BUFFER_SIZE', 1) stub_const('Gitlab::Database::Migrations::BackgroundMigrationHelpers::JOB_BUFFER_SIZE', 1)
end end
it 'queues jobs correctly' do it 'queues jobs correctly' do
...@@ -262,6 +262,120 @@ RSpec.describe Gitlab::Database::Migrations::BackgroundMigrationHelpers do ...@@ -262,6 +262,120 @@ RSpec.describe Gitlab::Database::Migrations::BackgroundMigrationHelpers do
end end
end end
describe '#queue_batched_background_migration' do
it 'creates the database record for the migration' do
expect do
model.queue_batched_background_migration(
'MyJobClass',
:projects,
:id,
job_interval: 5.minutes,
batch_min_value: 5,
batch_max_value: 1000,
batch_class_name: 'MyBatchClass',
batch_size: 100,
sub_batch_size: 10)
end.to change { Gitlab::Database::BackgroundMigration::BatchedMigration.count }.by(1)
expect(Gitlab::Database::BackgroundMigration::BatchedMigration.last).to have_attributes(
job_class_name: 'MyJobClass',
table_name: 'projects',
column_name: 'id',
interval: 300,
min_value: 5,
max_value: 1000,
batch_class_name: 'MyBatchClass',
batch_size: 100,
sub_batch_size: 10,
job_arguments: %w[],
status: 'active')
end
context 'when the job interval is lower than the minimum' do
let(:minimum_delay) { described_class::BATCH_MIN_DELAY }
it 'sets the job interval to the minimum value' do
expect do
model.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: minimum_delay - 1.minute)
end.to change { Gitlab::Database::BackgroundMigration::BatchedMigration.count }.by(1)
created_migration = Gitlab::Database::BackgroundMigration::BatchedMigration.last
expect(created_migration.interval).to eq(minimum_delay)
end
end
context 'when additional arguments are passed to the method' do
it 'saves the arguments on the database record' do
expect do
model.queue_batched_background_migration(
'MyJobClass',
:projects,
:id,
'my',
'arguments',
job_interval: 5.minutes,
batch_max_value: 1000)
end.to change { Gitlab::Database::BackgroundMigration::BatchedMigration.count }.by(1)
expect(Gitlab::Database::BackgroundMigration::BatchedMigration.last).to have_attributes(
job_class_name: 'MyJobClass',
table_name: 'projects',
column_name: 'id',
interval: 300,
min_value: 1,
max_value: 1000,
job_arguments: %w[my arguments])
end
end
context 'when the max_value is not given' do
context 'when records exist in the database' do
let!(:event1) { create(:event) }
let!(:event2) { create(:event) }
let!(:event3) { create(:event) }
it 'creates the record with the current max value' do
expect do
model.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: 5.minutes)
end.to change { Gitlab::Database::BackgroundMigration::BatchedMigration.count }.by(1)
created_migration = Gitlab::Database::BackgroundMigration::BatchedMigration.last
expect(created_migration.max_value).to eq(event3.id)
end
it 'creates the record with an active status' do
expect do
model.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: 5.minutes)
end.to change { Gitlab::Database::BackgroundMigration::BatchedMigration.count }.by(1)
expect(Gitlab::Database::BackgroundMigration::BatchedMigration.last).to be_active
end
end
context 'when the database is empty' do
it 'sets the max value to the min value' do
expect do
model.queue_batched_background_migration('MyJobClass', :events, :id, job_interval: 5.minutes)
end.to change { Gitlab::Database::BackgroundMigration::BatchedMigration.count }.by(1)
created_migration = Gitlab::Database::BackgroundMigration::BatchedMigration.last
expect(created_migration.max_value).to eq(created_migration.min_value)
end
it 'creates the record with a finished status' do
expect do
model.queue_batched_background_migration('MyJobClass', :projects, :id, job_interval: 5.minutes)
end.to change { Gitlab::Database::BackgroundMigration::BatchedMigration.count }.by(1)
expect(Gitlab::Database::BackgroundMigration::BatchedMigration.last).to be_finished
end
end
end
end
describe '#migrate_async' do describe '#migrate_async' do
it 'calls BackgroundMigrationWorker.perform_async' do it 'calls BackgroundMigrationWorker.perform_async' do
expect(BackgroundMigrationWorker).to receive(:perform_async).with("Class", "hello", "world") expect(BackgroundMigrationWorker).to receive(:perform_async).with("Class", "hello", "world")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment