Commit 446ebbc9 authored by Andreas Brandl's avatar Andreas Brandl Committed by Yannis Roussos

Add queuing mechanic for reindexing

This adds an explicit queuing mechanis for reindexing. The rake task
works as follows:

1. Trigger async index creation if necessary (still a hack)
1. Cleanup leftovers from previous run
1. Consume from the queue if non-empty
1. Execute heuristic to choose indexes for regular reindexing

Relates to https://gitlab.com/gitlab-org/gitlab/-/issues/330922

Changelog: added
parent 7c0d178a
# frozen_string_literal: true
class AddReindexingQueue < Gitlab::Database::Migration[1.0]
def change
create_table :postgres_reindex_queued_actions do |t|
t.text :index_identifier, null: false, limit: 255
t.integer :state, limit: 2, null: false, default: 0
t.timestamps_with_timezone null: false
t.index :state
end
change_column_default :postgres_reindex_queued_actions, :created_at, from: nil, to: -> { 'NOW()' }
change_column_default :postgres_reindex_queued_actions, :updated_at, from: nil, to: -> { 'NOW()' }
end
end
3e02605ce307d0ce37c3830e6909e7cfe5632408a757adf59209a70da92c0bc6
\ No newline at end of file
......@@ -17856,6 +17856,24 @@ CREATE SEQUENCE postgres_reindex_actions_id_seq
ALTER SEQUENCE postgres_reindex_actions_id_seq OWNED BY postgres_reindex_actions.id;
CREATE TABLE postgres_reindex_queued_actions (
id bigint NOT NULL,
index_identifier text NOT NULL,
state smallint DEFAULT 0 NOT NULL,
created_at timestamp with time zone DEFAULT now() NOT NULL,
updated_at timestamp with time zone DEFAULT now() NOT NULL,
CONSTRAINT check_e4b01395c0 CHECK ((char_length(index_identifier) <= 255))
);
CREATE SEQUENCE postgres_reindex_queued_actions_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
ALTER SEQUENCE postgres_reindex_queued_actions_id_seq OWNED BY postgres_reindex_queued_actions.id;
CREATE TABLE programming_languages (
id integer NOT NULL,
name character varying NOT NULL,
......@@ -21756,6 +21774,8 @@ ALTER TABLE ONLY postgres_async_indexes ALTER COLUMN id SET DEFAULT nextval('pos
ALTER TABLE ONLY postgres_reindex_actions ALTER COLUMN id SET DEFAULT nextval('postgres_reindex_actions_id_seq'::regclass);
ALTER TABLE ONLY postgres_reindex_queued_actions ALTER COLUMN id SET DEFAULT nextval('postgres_reindex_queued_actions_id_seq'::regclass);
ALTER TABLE ONLY product_analytics_events_experimental ALTER COLUMN id SET DEFAULT nextval('product_analytics_events_experimental_id_seq'::regclass);
ALTER TABLE ONLY programming_languages ALTER COLUMN id SET DEFAULT nextval('programming_languages_id_seq'::regclass);
......@@ -23583,6 +23603,9 @@ ALTER TABLE ONLY postgres_async_indexes
ALTER TABLE ONLY postgres_reindex_actions
ADD CONSTRAINT postgres_reindex_actions_pkey PRIMARY KEY (id);
ALTER TABLE ONLY postgres_reindex_queued_actions
ADD CONSTRAINT postgres_reindex_queued_actions_pkey PRIMARY KEY (id);
ALTER TABLE ONLY programming_languages
ADD CONSTRAINT programming_languages_pkey PRIMARY KEY (id);
......@@ -26264,6 +26287,8 @@ CREATE UNIQUE INDEX index_postgres_async_indexes_on_name ON postgres_async_index
CREATE INDEX index_postgres_reindex_actions_on_index_identifier ON postgres_reindex_actions USING btree (index_identifier);
CREATE INDEX index_postgres_reindex_queued_actions_on_state ON postgres_reindex_queued_actions USING btree (state);
CREATE UNIQUE INDEX index_programming_languages_on_name ON programming_languages USING btree (name);
CREATE INDEX index_project_access_tokens_on_project_id ON project_access_tokens USING btree (project_id);
......@@ -381,6 +381,7 @@ postgres_indexes: :gitlab_shared
postgres_partitioned_tables: :gitlab_shared
postgres_partitions: :gitlab_shared
postgres_reindex_actions: :gitlab_shared
postgres_reindex_queued_actions: :gitlab_main
product_analytics_events_experimental: :gitlab_main
programming_languages: :gitlab_main
project_access_tokens: :gitlab_main
......
......@@ -11,6 +11,7 @@ module Gitlab
has_one :bloat_estimate, class_name: 'Gitlab::Database::PostgresIndexBloatEstimate', foreign_key: :identifier
has_many :reindexing_actions, class_name: 'Gitlab::Database::Reindexing::ReindexAction', foreign_key: :index_identifier
has_many :queued_reindexing_actions, class_name: 'Gitlab::Database::Reindexing::QueuedAction', foreign_key: :index_identifier
scope :by_identifier, ->(identifier) do
raise ArgumentError, "Index name is not fully qualified with a schema: #{identifier}" unless identifier =~ /^\w+\.\w+$/
......
......@@ -15,13 +15,45 @@ module Gitlab
# on e.g. vacuum.
REMOVE_INDEX_RETRY_CONFIG = [[1.minute, 9.minutes]] * 30
# candidate_indexes: Array of Gitlab::Database::PostgresIndex
def self.perform(candidate_indexes, how_many: DEFAULT_INDEXES_PER_INVOCATION)
IndexSelection.new(candidate_indexes).take(how_many).each do |index|
# Performs automatic reindexing for a limited number of indexes per call
# 1. Consume from the explicit reindexing queue
# 2. Apply bloat heuristic to find most bloated indexes and reindex those
def self.automatic_reindexing(maximum_records: DEFAULT_INDEXES_PER_INVOCATION)
# Cleanup leftover temporary indexes from previous, possibly aborted runs (if any)
cleanup_leftovers!
# Consume from the explicit reindexing queue first
done_counter = perform_from_queue(maximum_records: maximum_records)
return if done_counter >= maximum_records
# Execute reindexing based on bloat heuristic
perform_with_heuristic(maximum_records: maximum_records - done_counter)
end
# Reindex based on bloat heuristic for a limited number of indexes per call
#
# We use a bloat heuristic to estimate the index bloat and pick the
# most bloated indexes for reindexing.
def self.perform_with_heuristic(candidate_indexes = Gitlab::Database::PostgresIndex.reindexing_support, maximum_records: DEFAULT_INDEXES_PER_INVOCATION)
IndexSelection.new(candidate_indexes).take(maximum_records).each do |index|
Coordinator.new(index).perform
end
end
# Reindex indexes that have been explicitly enqueued (for a limited number of indexes per call)
def self.perform_from_queue(maximum_records: DEFAULT_INDEXES_PER_INVOCATION)
QueuedAction.in_queue_order.limit(maximum_records).each do |queued_entry|
Coordinator.new(queued_entry.index).perform
queued_entry.done!
rescue StandardError => e
queued_entry.failed!
Gitlab::AppLogger.error("Failed to perform reindexing action on queued entry #{queued_entry}: #{e}")
end.size
end
def self.cleanup_leftovers!
PostgresIndex.reindexing_leftovers.each do |index|
Gitlab::AppLogger.info("Removing index #{index.identifier} which is a leftover, temporary index from previous reindexing activity")
......
# frozen_string_literal: true
module Gitlab
module Database
module Reindexing
class QueuedAction < SharedModel
self.table_name = 'postgres_reindex_queued_actions'
enum state: { queued: 0, done: 1, failed: 2 }
belongs_to :index, foreign_key: :index_identifier, class_name: 'Gitlab::Database::PostgresIndex'
scope :in_queue_order, -> { queued.order(:created_at) }
def to_s
"queued action [ id = #{id}, index: #{index_identifier} ]"
end
end
end
end
end
......@@ -160,39 +160,46 @@ namespace :gitlab do
Rake::Task['gitlab:db:create_dynamic_partitions'].invoke
end
desc 'reindex a regular index without downtime to eliminate bloat'
task :reindex, [:index_name, :database] => :environment do |_, args|
unless Feature.enabled?(:database_reindexing, type: :ops)
desc 'execute reindexing without downtime to eliminate bloat'
task reindex: :environment do
unless Feature.enabled?(:database_reindexing, type: :ops, default_enabled: :yaml)
puts "This feature (database_reindexing) is currently disabled.".color(:yellow)
exit
end
Gitlab::Database::EachDatabase.each_database_connection do |connection, connection_name|
indexes = Gitlab::Database::PostgresIndex.reindexing_support
if (identifier = args[:index_name]) && (args.fetch(:database, 'main') == connection_name)
raise ArgumentError, "Index name is not fully qualified with a schema: #{identifier}" unless identifier =~ /^\w+\.\w+$/
indexes = indexes.where(identifier: identifier)
raise "Index #{args[:index_name]} for #{connection_name} database not found or not supported" if indexes.empty?
end
Gitlab::Database::SharedModel.logger = Logger.new($stdout) if Gitlab::Utils.to_boolean(ENV['LOG_QUERIES_TO_CONSOLE'], default: false)
# Cleanup leftover temporary indexes from previous, possibly aborted runs (if any)
Gitlab::Database::Reindexing.cleanup_leftovers!
# Hack: Before we do actual reindexing work, create async indexes
Gitlab::Database::AsyncIndexes.create_pending_indexes! if Feature.enabled?(:database_async_index_creation, type: :ops)
Gitlab::Database::Reindexing.perform(indexes)
Gitlab::Database::Reindexing.automatic_reindexing
end
rescue StandardError => e
Gitlab::AppLogger.error(e)
raise
end
desc 'Enqueue an index for reindexing'
task :enqueue_reindexing_action, [:index_name, :database] => :environment do |_, args|
connection = Gitlab::Database.databases[args.fetch(:database, Gitlab::Database::PRIMARY_DATABASE_NAME)]
Gitlab::Database::SharedModel.using_connection(connection.scope.connection) do
queued_action = Gitlab::Database::PostgresIndex.find(args[:index_name]).queued_reindexing_actions.create!
puts "Queued reindexing action: #{queued_action}"
puts "There are #{Gitlab::Database::Reindexing::QueuedAction.queued.size} queued actions in total."
end
unless Feature.enabled?(:database_reindexing, type: :ops, default_enabled: :yaml)
puts <<~NOTE.color(:yellow)
Note: database_reindexing feature is currently disabled.
Enable with: Feature.enable(:database_reindexing)
NOTE
end
end
desc 'Check if there have been user additions to the database'
task active: :environment do
if ActiveRecord::Base.connection.migration_context.needs_migration?
......
# frozen_string_literal: true
FactoryBot.define do
factory :reindexing_queued_action, class: 'Gitlab::Database::Reindexing::QueuedAction' do
association :index, factory: :postgres_index
state { Gitlab::Database::Reindexing::QueuedAction.states[:queued] }
index_identifier { index.identifier }
end
end
......@@ -4,10 +4,63 @@ require 'spec_helper'
RSpec.describe Gitlab::Database::Reindexing do
include ExclusiveLeaseHelpers
include Database::DatabaseHelpers
describe '.perform' do
subject { described_class.perform(candidate_indexes) }
describe '.automatic_reindexing' do
subject { described_class.automatic_reindexing(maximum_records: limit) }
let(:limit) { 5 }
before_all do
swapout_view_for_table(:postgres_indexes)
end
before do
allow(Gitlab::Database::Reindexing).to receive(:cleanup_leftovers!)
allow(Gitlab::Database::Reindexing).to receive(:perform_from_queue).and_return(0)
allow(Gitlab::Database::Reindexing).to receive(:perform_with_heuristic).and_return(0)
end
it 'cleans up leftovers, before consuming the queue' do
expect(Gitlab::Database::Reindexing).to receive(:cleanup_leftovers!).ordered
expect(Gitlab::Database::Reindexing).to receive(:perform_from_queue).ordered
subject
end
context 'with records in the queue' do
before do
create(:reindexing_queued_action)
end
context 'with enough records in the queue to reach limit' do
let(:limit) { 1 }
it 'does not perform reindexing with heuristic' do
expect(Gitlab::Database::Reindexing).to receive(:perform_from_queue).and_return(limit)
expect(Gitlab::Database::Reindexing).not_to receive(:perform_with_heuristic)
subject
end
end
context 'without enough records in the queue to reach limit' do
let(:limit) { 2 }
it 'continues if the queue did not have enough records' do
expect(Gitlab::Database::Reindexing).to receive(:perform_from_queue).ordered.and_return(1)
expect(Gitlab::Database::Reindexing).to receive(:perform_with_heuristic).with(maximum_records: 1).ordered
subject
end
end
end
end
describe '.perform_with_heuristic' do
subject { described_class.perform_with_heuristic(candidate_indexes, maximum_records: limit) }
let(:limit) { 2 }
let(:coordinator) { instance_double(Gitlab::Database::Reindexing::Coordinator) }
let(:index_selection) { instance_double(Gitlab::Database::Reindexing::IndexSelection) }
let(:candidate_indexes) { double }
......@@ -15,7 +68,7 @@ RSpec.describe Gitlab::Database::Reindexing do
it 'delegates to Coordinator' do
expect(Gitlab::Database::Reindexing::IndexSelection).to receive(:new).with(candidate_indexes).and_return(index_selection)
expect(index_selection).to receive(:take).with(2).and_return(indexes)
expect(index_selection).to receive(:take).with(limit).and_return(indexes)
indexes.each do |index|
expect(Gitlab::Database::Reindexing::Coordinator).to receive(:new).with(index).and_return(coordinator)
......@@ -26,6 +79,59 @@ RSpec.describe Gitlab::Database::Reindexing do
end
end
describe '.perform_from_queue' do
subject { described_class.perform_from_queue(maximum_records: limit) }
before_all do
swapout_view_for_table(:postgres_indexes)
end
let(:limit) { 2 }
let(:queued_actions) { create_list(:reindexing_queued_action, 3) }
let(:coordinator) { instance_double(Gitlab::Database::Reindexing::Coordinator) }
before do
queued_actions.take(limit).each do |action|
allow(Gitlab::Database::Reindexing::Coordinator).to receive(:new).with(action.index).and_return(coordinator)
allow(coordinator).to receive(:perform)
end
end
it 'consumes the queue in order of created_at and applies the limit' do
queued_actions.take(limit).each do |action|
expect(Gitlab::Database::Reindexing::Coordinator).to receive(:new).ordered.with(action.index).and_return(coordinator)
expect(coordinator).to receive(:perform)
end
subject
end
it 'updates queued action and sets state to done' do
subject
queue = queued_actions
queue.shift(limit).each do |action|
expect(action.reload.state).to eq('done')
end
queue.each do |action|
expect(action.reload.state).to eq('queued')
end
end
it 'updates queued action upon error and sets state to failed' do
expect(Gitlab::Database::Reindexing::Coordinator).to receive(:new).ordered.with(queued_actions.first.index).and_return(coordinator)
expect(coordinator).to receive(:perform).and_raise('something went wrong')
subject
states = queued_actions.map(&:reload).map(&:state)
expect(states).to eq(%w(failed done queued))
end
end
describe '.cleanup_leftovers!' do
subject { described_class.cleanup_leftovers! }
......
......@@ -215,7 +215,7 @@ RSpec.describe 'gitlab:db namespace rake task', :silence_stdout do
stub_feature_flags(database_async_index_creation: true)
expect(Gitlab::Database::AsyncIndexes).to receive(:create_pending_indexes!).ordered.exactly(databases_count).times
expect(Gitlab::Database::Reindexing).to receive(:perform).ordered.exactly(databases_count).times
expect(Gitlab::Database::Reindexing).to receive(:automatic_reindexing).ordered.once
run_rake_task('gitlab:db:reindex')
end
......@@ -231,56 +231,30 @@ RSpec.describe 'gitlab:db namespace rake task', :silence_stdout do
end
end
context 'when no index_name is given' do
context 'calls automatic reindexing' do
it 'uses all candidate indexes' do
expect(Gitlab::Database::PostgresIndex).to receive(:reindexing_support).exactly(databases_count).times.and_return(indexes)
expect(Gitlab::Database::Reindexing).to receive(:perform).with(indexes).exactly(databases_count).times
expect(Gitlab::Database::Reindexing).to receive(:automatic_reindexing).once
run_rake_task('gitlab:db:reindex')
end
end
end
context 'with index name given' do
let(:index) { double('index') }
before do
allow(Gitlab::Database::PostgresIndex).to receive(:reindexing_support).and_return(indexes)
end
it 'calls the index rebuilder with the proper arguments' do
allow(indexes).to receive(:where).with(identifier: 'public.foo_idx').and_return([index])
expect(Gitlab::Database::Reindexing).to receive(:perform).with([index]).ordered
expect(Gitlab::Database::Reindexing).to receive(:perform).with(indexes).ordered if databases.many?
run_rake_task('gitlab:db:reindex', '[public.foo_idx]')
end
context 'when database name is provided' do
it 'calls the index rebuilder with the proper arguments when the database name match' do
allow(indexes).to receive(:where).with(identifier: 'public.foo_idx').and_return([index])
expect(Gitlab::Database::Reindexing).to receive(:perform).with([index]).ordered
expect(Gitlab::Database::Reindexing).to receive(:perform).with(indexes).ordered if databases.many?
run_rake_task('gitlab:db:reindex', '[public.foo_idx,main]')
end
it 'ignores the index and uses all candidate indexes if database name does not match' do
expect(Gitlab::Database::PostgresIndex).to receive(:reindexing_support).exactly(databases_count).times.and_return(indexes)
expect(Gitlab::Database::Reindexing).to receive(:perform).with(indexes).exactly(databases_count).times
run_rake_task('gitlab:db:reindex', '[public.foo_idx,no_such_database]')
end
end
describe 'enqueue_reindexing_action' do
let(:index_name) { 'public.users_pkey' }
it 'raises an error if the index does not exist' do
allow(indexes).to receive(:where).with(identifier: 'public.absent_index').and_return([])
it 'creates an entry in the queue' do
expect do
run_rake_task('gitlab:db:enqueue_reindexing_action', "[#{index_name}, main]")
end.to change { Gitlab::Database::PostgresIndex.find(index_name).queued_reindexing_actions.size }.from(0).to(1)
end
expect { run_rake_task('gitlab:db:reindex', '[public.absent_index]') }.to raise_error(/Index public.absent_index for main database not found or not supported/)
end
it 'defaults to main database' do
expect(Gitlab::Database::SharedModel).to receive(:using_connection).with(Gitlab::Database.main.scope.connection).and_call_original
it 'raises an error if the index is not fully qualified with a schema' do
expect { run_rake_task('gitlab:db:reindex', '[foo_idx]') }.to raise_error(/Index name is not fully qualified/)
end
expect do
run_rake_task('gitlab:db:enqueue_reindexing_action', "[#{index_name}]")
end.to change { Gitlab::Database::PostgresIndex.find(index_name).queued_reindexing_actions.size }.from(0).to(1)
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment