Delete unused registries

This is one step forward to use the tracking database
as the single source of truth.
parent fc39da08
......@@ -49,8 +49,16 @@ module Geo
#
# @return [Array] the first element is an Array of untracked IDs, and the second element is an Array of tracked IDs that are unused
def find_registry_differences(range)
source_ids = job_artifacts(fdw: false).where(id: range).pluck(::Ci::JobArtifact.arel_table[:id]) # rubocop:disable CodeReuse/ActiveRecord
tracked_ids = Geo::JobArtifactRegistry.pluck_model_ids_in_range(range)
# rubocop:disable CodeReuse/ActiveRecord
source_ids =
job_artifacts(fdw: false)
.id_in(range)
.pluck(::Ci::JobArtifact.arel_table[:id])
# rubocop:enable CodeReuse/ActiveRecord
tracked_ids =
Geo::JobArtifactRegistry
.pluck_model_ids_in_range(range)
untracked_ids = source_ids - tracked_ids
unused_tracked_ids = tracked_ids - source_ids
......
......@@ -49,8 +49,14 @@ module Geo
#
# @return [Array] the first element is an Array of untracked IDs, and the second element is an Array of tracked IDs that are unused
def find_registry_differences(range)
source_ids = lfs_objects(fdw: false).where(id: range).pluck_primary_key # rubocop:disable CodeReuse/ActiveRecord
tracked_ids = Geo::LfsObjectRegistry.pluck_model_ids_in_range(range)
source_ids =
lfs_objects(fdw: false)
.id_in(range)
.pluck_primary_key
tracked_ids =
Geo::LfsObjectRegistry
.pluck_model_ids_in_range(range)
untracked_ids = source_ids - tracked_ids
unused_tracked_ids = tracked_ids - source_ids
......
......@@ -25,6 +25,10 @@ class Geo::JobArtifactRegistry < Geo::BaseRegistry
::Geo::JobArtifactRegistryFinder
end
def self.delete_worker_class
::Geo::FileRegistryRemovalWorker
end
# When false, RegistryConsistencyService will frequently check the end of the
# table to quickly handle new replicables.
def self.has_create_events?
......@@ -33,17 +37,17 @@ class Geo::JobArtifactRegistry < Geo::BaseRegistry
# TODO: remove once `success` column has a default value set
# https://gitlab.com/gitlab-org/gitlab/-/issues/214407
def self.insert_for_model_ids(attrs)
records = attrs.map do |artifact_id, _|
def self.insert_for_model_ids(artifact_ids)
records = artifact_ids.map do |artifact_id|
new(artifact_id: artifact_id, success: false, created_at: Time.zone.now)
end
bulk_insert!(records, returns: :ids)
end
def self.delete_for_model_ids(attrs)
records = attrs.map do |artifact_id, _|
::Geo::FileRegistryRemovalWorker.perform_async(:job_artifact, artifact_id) # rubocop:disable CodeReuse/Worker
def self.delete_for_model_ids(artifact_ids)
artifact_ids.map do |artifact_id|
delete_worker_class.perform_async(:job_artifact, artifact_id)
end
end
......
......@@ -21,15 +21,19 @@ class Geo::LfsObjectRegistry < Geo::BaseRegistry
::Geo::LfsObjectRegistryFinder
end
def self.delete_worker_class
::Geo::FileRegistryRemovalWorker
end
# If false, RegistryConsistencyService will frequently check the end of the
# table to quickly handle new replicables.
def self.has_create_events?
false
end
def self.delete_for_model_ids(ids)
ids.map do |id|
::Geo::FileRegistryRemovalWorker.perform_async(:lfs, id) # rubocop:disable CodeReuse/Worker
def self.delete_for_model_ids(lfs_object_ids)
lfs_object_ids.map do |lfs_object_id|
delete_worker_class.perform_async(:lfs, lfs_object_id)
end
end
end
......@@ -22,6 +22,10 @@ class Geo::UploadRegistry < Geo::BaseRegistry
::Geo::AttachmentRegistryFinder
end
def self.delete_worker_class
::Geo::FileRegistryRemovalWorker
end
# If false, RegistryConsistencyService will frequently check the end of the
# table to quickly handle new replicables.
def self.has_create_events?
......@@ -38,7 +42,7 @@ class Geo::UploadRegistry < Geo::BaseRegistry
def self.delete_for_model_ids(attrs)
attrs.map do |file_id, file_type|
::Geo::FileRegistryRemovalWorker.perform_async(file_type, file_id) # rubocop:disable CodeReuse/Worker
delete_worker_class.perform_async(file_type, file_id)
end
end
......
......@@ -14,15 +14,21 @@ module Geo
@batch_size = batch_size
end
# @return [Boolean] whether at least one registry has been created or deleted in range
def execute
# There are some edge cases to handle here:
#
# 1. When there are unused registries, but there no replicable records next_range! returns nil;
# 2. When the unused registry foreign key ids are greater than the last replicable record id;
# 3. When the unused registry foreign key ids are lower than the first replicable record id;
#
range = next_range!
return unless range
created_in_range = create_missing_in_range(range)
created_above = create_missing_above(end_of_batch: range.last)
created_in_range, deleted_in_range = handle_differences_in_range(range)
created_above, deleted_above = create_missing_above(end_of_batch: range.last)
created_in_range.any? ||
created_above.any?
[created_in_range, deleted_in_range, created_above, deleted_above].flatten.compact.any?
rescue => e
log_error("Error while backfilling #{registry_class}", e)
......@@ -40,24 +46,38 @@ module Geo
"registry_consistency:#{registry_class.name.parameterize}"
end
# @return [Array] the list of IDs of created records
def create_missing_in_range(range)
untracked, _ = find_registry_differences(range)
return [] if untracked.empty?
def find_registry_differences(range)
finder.find_registry_differences(range)
end
def finder
@finder ||= registry_class.finder_class.new(current_node_id: Gitlab::Geo.current_node.id)
end
created = registry_class.insert_for_model_ids(untracked)
def handle_differences_in_range(range)
untracked, unused = find_registry_differences(range)
log_created(range, untracked, created)
created_in_range = create_untracked_in_range(untracked)
log_created(range, untracked, created_in_range)
created
deleted_in_range = delete_unused_in_range(unused)
log_deleted(range, unused, deleted_in_range)
[created_in_range, deleted_in_range]
end
def find_registry_differences(range)
finder.find_registry_differences(range)
# @return [Array] the list of IDs of created records
def create_untracked_in_range(untracked)
return [] if untracked.empty?
registry_class.insert_for_model_ids(untracked)
end
def finder
@finder ||= registry_class.finder_class.new(current_node_id: Gitlab::Geo.current_node.id)
# @return [Array] the list of IDs of deleted records
def delete_unused_in_range(delete_unused_in_range)
return [] if delete_unused_in_range.empty?
registry_class.delete_for_model_ids(delete_unused_in_range)
end
# This hack is used to sync new files soon after they are created.
......@@ -65,7 +85,7 @@ module Geo
# This is not needed for replicables that have already implemented
# create events.
#
# @param [Integer] the last ID of the batch processed in create_missing_in_range
# @param [Integer] the last ID of the batch processed in create_untracked_in_range
# @return [Array] the list of IDs of created records
def create_missing_above(end_of_batch:)
return [] if registry_class.has_create_events?
......@@ -81,7 +101,7 @@ module Geo
start = last_id - batch_size + 1
finish = last_id
create_missing_in_range(start..finish)
handle_differences_in_range(start..finish)
end
# Returns true when LoopingBatcher will soon return ranges near the end of
......@@ -104,5 +124,18 @@ module Geo
}
)
end
def log_deleted(range, unused, deleted)
log_info(
"Deleted registry entries",
{
registry_class: registry_class.name,
start: range.first,
finish: range.last,
deleted: deleted.length,
failed_to_delete: unused.length - deleted.length
}
)
end
end
end
......@@ -2,22 +2,25 @@
require 'spec_helper'
describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_store_caching do
describe Geo::RegistryConsistencyService, :geo, :use_clean_rails_memory_store_caching do
include EE::GeoHelpers
let(:secondary) { create(:geo_node) }
subject { described_class.new(registry_class, batch_size: batch_size) }
before do
stub_current_geo_node(secondary)
end
::Geo::Secondary::RegistryConsistencyWorker::REGISTRY_CLASSES.each do |klass|
let(:registry_class) { klass }
let(:registry_class_factory) { registry_class.underscore.tr('/', '_').to_sym }
let(:model_class) { registry_class::MODEL_CLASS }
let(:model_class_factory) { model_class.underscore.tr('/', '_').to_sym }
let(:model_foreign_key) { registry_class::MODEL_FOREIGN_KEY }
let(:batch_size) { 2 }
subject { described_class.new(registry_class, batch_size: batch_size) }
describe 'registry_class interface' do
it 'defines a MODEL_CLASS constant' do
expect(registry_class::MODEL_CLASS).not_to be_nil
......@@ -31,6 +34,10 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
expect(registry_class).to respond_to(:insert_for_model_ids)
end
it 'responds to .delete_for_model_ids' do
expect(registry_class).to respond_to(:delete_for_model_ids)
end
it 'responds to .finder_class' do
expect(registry_class).to respond_to(:finder_class)
end
......@@ -42,7 +49,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
describe '#execute' do
context 'when there are replicable records missing registries' do
let!(:expected_batch) { create_list(model_class.underscore.to_sym, batch_size) }
let!(:expected_batch) { create_list(model_class_factory, batch_size) }
it 'creates missing registries' do
expect do
......@@ -55,7 +62,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
end
it 'does not exceed batch size' do
not_expected = create(model_class.underscore.to_sym)
not_expected = create(model_class_factory)
subject.execute
......@@ -64,7 +71,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
# Temporarily, until we implement create events for these replicables
context 'when the number of records is greater than 6 batches' do
let!(:five_batches_worth) { create_list(model_class.underscore.to_sym, 5 * batch_size) }
let!(:five_batches_worth) { create_list(model_class_factory, 5 * batch_size) }
context 'when the previous batch is greater than 5 batches from the end of the table' do
context 'when create events are implemented for this replicable' do
......@@ -82,8 +89,8 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
expect(registry_class.model_id_in(expected).count).to eq(2)
end
it 'calls #create_missing_in_range only once' do
expect(subject).to receive(:create_missing_in_range).once.and_call_original
it 'calls #handle_differences_in_range only once' do
expect(subject).to receive(:handle_differences_in_range).once.and_call_original
subject.execute
end
......@@ -104,8 +111,8 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
expect(registry_class.model_id_in(expected).count).to eq(4)
end
it 'calls #create_missing_in_range twice' do
expect(subject).to receive(:create_missing_in_range).twice.and_call_original
it 'calls #handle_differences_in_range twice' do
expect(subject).to receive(:handle_differences_in_range).twice.and_call_original
subject.execute
end
......@@ -124,8 +131,8 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
end.to change { registry_class.count }.by(batch_size)
end
it 'calls #create_missing_in_range once' do
expect(subject).to receive(:create_missing_in_range).once.and_call_original
it 'calls #handle_differences_in_range once' do
expect(subject).to receive(:handle_differences_in_range).once.and_call_original
subject.execute
end
......@@ -133,17 +140,42 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
end
context 'when the number of records is less than 6 batches' do
it 'calls #create_missing_in_range once' do
expect(subject).to receive(:create_missing_in_range).once.and_call_original
it 'calls #handle_differences_in_range once' do
expect(subject).to receive(:handle_differences_in_range).once.and_call_original
subject.execute
end
end
end
context 'when there are unused registries' do
let(:records) { create_list(model_class_factory, batch_size) }
let(:unused_registry_ids) { [records.first].map(&:id) }
let!(:registries) do
records.map do |record|
create(registry_class_factory, model_foreign_key => record.id)
end
end
before do
model_class.where(id: unused_registry_ids).delete_all
end
it 'deletes unused registries', :sidekiq_inline do
subject.execute
expect(registry_class.where(model_foreign_key => unused_registry_ids)).to be_empty
end
it 'returns truthy' do
expect(subject.execute).to be_truthy
end
end
context 'when all replicable records have registries' do
it 'does nothing' do
create_list(model_class.underscore.to_sym, batch_size)
create_list(model_class_factory, batch_size)
subject.execute # create the missing registries
......@@ -153,7 +185,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
end
it 'returns falsey' do
create_list(model_class.underscore.to_sym, batch_size)
create_list(model_class_factory, batch_size)
subject.execute # create the missing registries
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment