Commit da5e21b3 authored by Andreas Brandl's avatar Andreas Brandl

Optimize batch size for migrations automatically

This is based on what we call "time efficiency". For a single job, time
efficiency is the ratio of total duration to interval. Ideally, this is
close to but smaller than 1.

We use exponential smoothing (EMA) to look at the last 10 jobs and their time
efficiency. With EMA, we tolerate a spiking job better than by just
looking at the most recent job.

We also define upper and lower bounds for the batch size.
parent 5889dadb
---
name: optimize_batched_migrations
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/60133
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/328817
milestone: '13.12'
type: ops
group: group::database
default_enabled: false
# frozen_string_literal: true
module Gitlab
module Database
module BackgroundMigration
# This is an optimizer for throughput of batched migration jobs
#
# The underyling mechanic is based on the concept of time efficiency:
# time efficiency = job duration / interval
# Ideally, this is close but lower than 1 - so we're using time efficiently.
#
# We aim to land in the 90%-98% range, which gives the database a little breathing room
# in between.
#
# The optimizer is based on calculating the exponential moving average of time efficiencies
# for the last N jobs. If we're outside the range, we add 10% to or decrease by 20% of the batch size.
class BatchOptimizer
# Target time efficiency for a job
# Time efficiency is defined as: job duration / interval
TARGET_EFFICIENCY = (0.8..0.98).freeze
# Lower and upper bound for the batch size
ALLOWED_BATCH_SIZE = (1_000..1_000_000).freeze
# Use this batch_size multiplier to increase batch size
INCREASE_MULTIPLIER = 1.1
# Use this batch_size multiplier to decrease batch size
DECREASE_MULTIPLIER = 0.8
attr_reader :migration, :number_of_jobs
def initialize(migration, number_of_jobs: 10)
@migration = migration
@number_of_jobs = number_of_jobs
end
def optimize!
return unless Feature.enabled?(:optimize_batched_migrations, type: :ops)
if multiplier = batch_size_multiplier
migration.batch_size = (migration.batch_size * multiplier).to_i.clamp(ALLOWED_BATCH_SIZE)
migration.save!
end
end
private
def batch_size_multiplier
efficiency = migration.smoothed_time_efficiency(number_of_jobs: number_of_jobs)
return unless efficiency
if TARGET_EFFICIENCY.include?(efficiency)
# We hit the range - no change
nil
elsif efficiency > TARGET_EFFICIENCY.max
# We're above the range - decrease by 20%
DECREASE_MULTIPLIER
else
# We're below the range - increase by 10%
INCREASE_MULTIPLIER
end
end
end
end
end
end
......@@ -19,6 +19,16 @@ module Gitlab
to: :batched_migration, prefix: :migration
attribute :pause_ms, :integer, default: 100
def time_efficiency
return unless succeeded?
return unless finished_at && started_at
duration = finished_at - started_at
# TODO: Switch to individual job interval (prereq: https://gitlab.com/gitlab-org/gitlab/-/issues/328801)
duration.to_f / batched_migration.interval
end
end
end
end
......
......@@ -13,6 +13,9 @@ module Gitlab
has_one :last_job, -> { order(id: :desc) },
class_name: 'Gitlab::Database::BackgroundMigration::BatchedJob',
foreign_key: :batched_background_migration_id
has_many :last_jobs, -> { order(id: :desc) },
class_name: 'Gitlab::Database::BackgroundMigration::BatchedJob',
foreign_key: :batched_background_migration_id
scope :queue_order, -> { order(id: :asc) }
......@@ -76,6 +79,30 @@ module Gitlab
migration_identifier: "%s/%s.%s" % [job_class_name, table_name, column_name]
}
end
def smoothed_time_efficiency(number_of_jobs: 10, alpha: 0.2)
jobs = last_jobs.succeeded.limit(number_of_jobs)
return if jobs.size < number_of_jobs
efficiencies = jobs.map(&:time_efficiency).reject(&:nil?).each_with_index
dividend = efficiencies.reduce(0) do |total, (job_eff, i)|
total + job_eff * (1 - alpha)**i
end
divisor = efficiencies.reduce(0) do |total, (job_eff, i)|
total + (1 - alpha)**i
end
return if divisor == 0
(dividend / divisor).round(2)
end
def optimize!
BatchOptimizer.new(self).optimize!
end
end
end
end
......
......@@ -21,6 +21,8 @@ module Gitlab
def run_migration_job(active_migration)
if next_batched_job = create_next_batched_job!(active_migration)
migration_wrapper.perform(next_batched_job)
active_migration.optimize!
else
finish_active_migration(active_migration)
end
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Database::BackgroundMigration::BatchOptimizer do
describe '#optimize' do
subject { described_class.new(migration, number_of_jobs: number_of_jobs).optimize! }
let(:migration) { create(:batched_background_migration, batch_size: batch_size, sub_batch_size: 100, interval: 120) }
let(:batch_size) { 10_000 }
let_it_be(:number_of_jobs) { 5 }
def mock_efficiency(eff)
expect(migration).to receive(:smoothed_time_efficiency).with(number_of_jobs: number_of_jobs).and_return(eff)
end
it 'with unknown time efficiency, it keeps the batch size' do
mock_efficiency(nil)
expect { subject }.not_to change { migration.reload.batch_size }
end
it 'with a time efficiency of 95%, it keeps the batch size' do
mock_efficiency(0.95)
expect { subject }.not_to change { migration.reload.batch_size }
end
it 'with a time efficiency of 90%, it keeps the batch size' do
mock_efficiency(0.9)
expect { subject }.not_to change { migration.reload.batch_size }
end
it 'with a time efficiency of 70%, it increases the batch size by 10%' do
mock_efficiency(0.7)
expect { subject }.to change { migration.reload.batch_size }.from(10_000).to(11_000)
end
it 'with a time efficiency of 110%, it decreases the batch size by 20%' do
mock_efficiency(1.1)
expect { subject }.to change { migration.reload.batch_size }.from(10_000).to(8_000)
end
context 'reaching the upper limit for the batch size' do
let(:batch_size) { 950_000 }
it 'caps the batch size at 10M' do
mock_efficiency(0.7)
expect { subject }.to change { migration.reload.batch_size }.to(1_000_000)
end
end
context 'reaching the lower limit for the batch size' do
let(:batch_size) { 1_050 }
it 'caps the batch size at 1k' do
mock_efficiency(1.1)
expect { subject }.to change { migration.reload.batch_size }.to(1_000)
end
end
end
end
......@@ -47,4 +47,55 @@ RSpec.describe Gitlab::Database::BackgroundMigration::BatchedJob, type: :model d
end
end
end
describe '#time_efficiency' do
subject { job.time_efficiency }
let(:migration) { build(:batched_background_migration, interval: 120.seconds) }
let(:job) { build(:batched_background_migration_job, status: :succeeded, batched_migration: migration) }
context 'when job has not yet succeeded' do
let(:job) { build(:batched_background_migration_job, status: :running) }
it 'returns nil' do
expect(subject).to be_nil
end
end
context 'when finished_at is not set' do
it 'returns nil' do
job.started_at = Time.zone.now
expect(subject).to be_nil
end
end
context 'when started_at is not set' do
it 'returns nil' do
job.finished_at = Time.zone.now
expect(subject).to be_nil
end
end
context 'when job has finished' do
it 'returns ratio of duration to interval, here: 0.5' do
freeze_time do
job.started_at = Time.zone.now - migration.interval / 2
job.finished_at = Time.zone.now
expect(subject).to eq(0.5)
end
end
it 'returns ratio of duration to interval, here: 1' do
freeze_time do
job.started_at = Time.zone.now - migration.interval
job.finished_at = Time.zone.now
expect(subject).to eq(1)
end
end
end
end
end
......@@ -50,6 +50,15 @@ RSpec.describe Gitlab::Database::BackgroundMigration::BatchedMigrationRunner do
batch_size: migration.batch_size,
sub_batch_size: migration.sub_batch_size)
end
it 'optimizes the migration after executing the job' do
migration.update!(min_value: event1.id, max_value: event2.id)
expect(migration_wrapper).to receive(:perform).ordered
expect(migration).to receive(:optimize!).ordered
runner.run_migration_job(migration)
end
end
context 'when the batch maximum exceeds the migration maximum' do
......
......@@ -232,4 +232,96 @@ RSpec.describe Gitlab::Database::BackgroundMigration::BatchedMigration, type: :m
expect(batched_migration.prometheus_labels).to eq(labels)
end
end
describe '#smoothed_time_efficiency' do
let(:migration) { create(:batched_background_migration, interval: 120.seconds) }
let(:end_time) { Time.zone.now }
around do |example|
freeze_time do
example.run
end
end
let(:common_attrs) do
{
status: :succeeded,
batched_migration: migration,
finished_at: end_time
}
end
context 'when there are not enough jobs' do
subject { migration.smoothed_time_efficiency(number_of_jobs: 10) }
it 'returns nil' do
create_list(:batched_background_migration_job, 9, **common_attrs)
expect(subject).to be_nil
end
end
context 'when there are enough jobs' do
subject { migration.smoothed_time_efficiency(number_of_jobs: number_of_jobs) }
let!(:jobs) { create_list(:batched_background_migration_job, number_of_jobs, **common_attrs.merge(batched_migration: migration)) }
let(:number_of_jobs) { 10 }
before do
expect(migration).to receive_message_chain(:last_jobs, :succeeded, :limit).with(no_args).with(no_args).with(number_of_jobs).and_return(jobs)
end
def mock_efficiencies(*effs)
effs.each_with_index do |eff, i|
expect(jobs[i]).to receive(:time_efficiency).and_return(eff)
end
end
context 'example 1: increasing trend, but only recently crossed threshold' do
it 'returns the smoothed time efficiency' do
mock_efficiencies(1.1, 1, 0.95, 0.9, 0.8, 0.95, 0.9, 0.8, 0.9, 0.95)
expect(subject).to be_within(0.05).of(0.95)
end
end
context 'example 2: increasing trend, crossed threshold a while ago' do
it 'returns the smoothed time efficiency' do
mock_efficiencies(1.2, 1.1, 1, 1, 1.1, 1, 0.95, 0.9, 0.95, 0.9)
expect(subject).to be_within(0.05).of(1.1)
end
end
context 'example 3: decreasing trend, but only recently crossed threshold' do
it 'returns the smoothed time efficiency' do
mock_efficiencies(0.9, 0.95, 1, 1.2, 1.1, 1.2, 1.1, 1.0, 1.1, 1.0)
expect(subject).to be_within(0.05).of(1.0)
end
end
context 'example 4: latest run spiked' do
it 'returns the smoothed time efficiency' do
mock_efficiencies(1.2, 0.9, 0.8, 0.9, 0.95, 0.9, 0.92, 0.9, 0.95, 0.9)
expect(subject).to be_within(0.02).of(0.96)
end
end
end
end
describe '#optimize!' do
subject { batched_migration.optimize! }
let(:batched_migration) { create(:batched_background_migration) }
let(:optimizer) { instance_double('Gitlab::Database::BackgroundMigration::BatchOptimizer') }
it 'calls the BatchOptimizer' do
expect(Gitlab::Database::BackgroundMigration::BatchOptimizer).to receive(:new).with(batched_migration).and_return(optimizer)
expect(optimizer).to receive(:optimize!)
subject
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment