Delete unused registries

This is one step forward to use the tracking database
as the single source of truth.
parent fc39da08
...@@ -49,8 +49,16 @@ module Geo ...@@ -49,8 +49,16 @@ module Geo
# #
# @return [Array] the first element is an Array of untracked IDs, and the second element is an Array of tracked IDs that are unused # @return [Array] the first element is an Array of untracked IDs, and the second element is an Array of tracked IDs that are unused
def find_registry_differences(range) def find_registry_differences(range)
source_ids = job_artifacts(fdw: false).where(id: range).pluck(::Ci::JobArtifact.arel_table[:id]) # rubocop:disable CodeReuse/ActiveRecord # rubocop:disable CodeReuse/ActiveRecord
tracked_ids = Geo::JobArtifactRegistry.pluck_model_ids_in_range(range) source_ids =
job_artifacts(fdw: false)
.id_in(range)
.pluck(::Ci::JobArtifact.arel_table[:id])
# rubocop:enable CodeReuse/ActiveRecord
tracked_ids =
Geo::JobArtifactRegistry
.pluck_model_ids_in_range(range)
untracked_ids = source_ids - tracked_ids untracked_ids = source_ids - tracked_ids
unused_tracked_ids = tracked_ids - source_ids unused_tracked_ids = tracked_ids - source_ids
......
...@@ -49,8 +49,14 @@ module Geo ...@@ -49,8 +49,14 @@ module Geo
# #
# @return [Array] the first element is an Array of untracked IDs, and the second element is an Array of tracked IDs that are unused # @return [Array] the first element is an Array of untracked IDs, and the second element is an Array of tracked IDs that are unused
def find_registry_differences(range) def find_registry_differences(range)
source_ids = lfs_objects(fdw: false).where(id: range).pluck_primary_key # rubocop:disable CodeReuse/ActiveRecord source_ids =
tracked_ids = Geo::LfsObjectRegistry.pluck_model_ids_in_range(range) lfs_objects(fdw: false)
.id_in(range)
.pluck_primary_key
tracked_ids =
Geo::LfsObjectRegistry
.pluck_model_ids_in_range(range)
untracked_ids = source_ids - tracked_ids untracked_ids = source_ids - tracked_ids
unused_tracked_ids = tracked_ids - source_ids unused_tracked_ids = tracked_ids - source_ids
......
...@@ -25,6 +25,10 @@ class Geo::JobArtifactRegistry < Geo::BaseRegistry ...@@ -25,6 +25,10 @@ class Geo::JobArtifactRegistry < Geo::BaseRegistry
::Geo::JobArtifactRegistryFinder ::Geo::JobArtifactRegistryFinder
end end
def self.delete_worker_class
::Geo::FileRegistryRemovalWorker
end
# When false, RegistryConsistencyService will frequently check the end of the # When false, RegistryConsistencyService will frequently check the end of the
# table to quickly handle new replicables. # table to quickly handle new replicables.
def self.has_create_events? def self.has_create_events?
...@@ -33,17 +37,17 @@ class Geo::JobArtifactRegistry < Geo::BaseRegistry ...@@ -33,17 +37,17 @@ class Geo::JobArtifactRegistry < Geo::BaseRegistry
# TODO: remove once `success` column has a default value set # TODO: remove once `success` column has a default value set
# https://gitlab.com/gitlab-org/gitlab/-/issues/214407 # https://gitlab.com/gitlab-org/gitlab/-/issues/214407
def self.insert_for_model_ids(attrs) def self.insert_for_model_ids(artifact_ids)
records = attrs.map do |artifact_id, _| records = artifact_ids.map do |artifact_id|
new(artifact_id: artifact_id, success: false, created_at: Time.zone.now) new(artifact_id: artifact_id, success: false, created_at: Time.zone.now)
end end
bulk_insert!(records, returns: :ids) bulk_insert!(records, returns: :ids)
end end
def self.delete_for_model_ids(attrs) def self.delete_for_model_ids(artifact_ids)
records = attrs.map do |artifact_id, _| artifact_ids.map do |artifact_id|
::Geo::FileRegistryRemovalWorker.perform_async(:job_artifact, artifact_id) # rubocop:disable CodeReuse/Worker delete_worker_class.perform_async(:job_artifact, artifact_id)
end end
end end
......
...@@ -21,15 +21,19 @@ class Geo::LfsObjectRegistry < Geo::BaseRegistry ...@@ -21,15 +21,19 @@ class Geo::LfsObjectRegistry < Geo::BaseRegistry
::Geo::LfsObjectRegistryFinder ::Geo::LfsObjectRegistryFinder
end end
def self.delete_worker_class
::Geo::FileRegistryRemovalWorker
end
# If false, RegistryConsistencyService will frequently check the end of the # If false, RegistryConsistencyService will frequently check the end of the
# table to quickly handle new replicables. # table to quickly handle new replicables.
def self.has_create_events? def self.has_create_events?
false false
end end
def self.delete_for_model_ids(ids) def self.delete_for_model_ids(lfs_object_ids)
ids.map do |id| lfs_object_ids.map do |lfs_object_id|
::Geo::FileRegistryRemovalWorker.perform_async(:lfs, id) # rubocop:disable CodeReuse/Worker delete_worker_class.perform_async(:lfs, lfs_object_id)
end end
end end
end end
...@@ -22,6 +22,10 @@ class Geo::UploadRegistry < Geo::BaseRegistry ...@@ -22,6 +22,10 @@ class Geo::UploadRegistry < Geo::BaseRegistry
::Geo::AttachmentRegistryFinder ::Geo::AttachmentRegistryFinder
end end
def self.delete_worker_class
::Geo::FileRegistryRemovalWorker
end
# If false, RegistryConsistencyService will frequently check the end of the # If false, RegistryConsistencyService will frequently check the end of the
# table to quickly handle new replicables. # table to quickly handle new replicables.
def self.has_create_events? def self.has_create_events?
...@@ -38,7 +42,7 @@ class Geo::UploadRegistry < Geo::BaseRegistry ...@@ -38,7 +42,7 @@ class Geo::UploadRegistry < Geo::BaseRegistry
def self.delete_for_model_ids(attrs) def self.delete_for_model_ids(attrs)
attrs.map do |file_id, file_type| attrs.map do |file_id, file_type|
::Geo::FileRegistryRemovalWorker.perform_async(file_type, file_id) # rubocop:disable CodeReuse/Worker delete_worker_class.perform_async(file_type, file_id)
end end
end end
......
...@@ -14,15 +14,21 @@ module Geo ...@@ -14,15 +14,21 @@ module Geo
@batch_size = batch_size @batch_size = batch_size
end end
# @return [Boolean] whether at least one registry has been created or deleted in range
def execute def execute
# There are some edge cases to handle here:
#
# 1. When there are unused registries, but there no replicable records next_range! returns nil;
# 2. When the unused registry foreign key ids are greater than the last replicable record id;
# 3. When the unused registry foreign key ids are lower than the first replicable record id;
#
range = next_range! range = next_range!
return unless range return unless range
created_in_range = create_missing_in_range(range) created_in_range, deleted_in_range = handle_differences_in_range(range)
created_above = create_missing_above(end_of_batch: range.last) created_above, deleted_above = create_missing_above(end_of_batch: range.last)
created_in_range.any? || [created_in_range, deleted_in_range, created_above, deleted_above].flatten.compact.any?
created_above.any?
rescue => e rescue => e
log_error("Error while backfilling #{registry_class}", e) log_error("Error while backfilling #{registry_class}", e)
...@@ -40,24 +46,38 @@ module Geo ...@@ -40,24 +46,38 @@ module Geo
"registry_consistency:#{registry_class.name.parameterize}" "registry_consistency:#{registry_class.name.parameterize}"
end end
# @return [Array] the list of IDs of created records def find_registry_differences(range)
def create_missing_in_range(range) finder.find_registry_differences(range)
untracked, _ = find_registry_differences(range) end
return [] if untracked.empty?
def finder
@finder ||= registry_class.finder_class.new(current_node_id: Gitlab::Geo.current_node.id)
end
created = registry_class.insert_for_model_ids(untracked) def handle_differences_in_range(range)
untracked, unused = find_registry_differences(range)
log_created(range, untracked, created) created_in_range = create_untracked_in_range(untracked)
log_created(range, untracked, created_in_range)
created deleted_in_range = delete_unused_in_range(unused)
log_deleted(range, unused, deleted_in_range)
[created_in_range, deleted_in_range]
end end
def find_registry_differences(range) # @return [Array] the list of IDs of created records
finder.find_registry_differences(range) def create_untracked_in_range(untracked)
return [] if untracked.empty?
registry_class.insert_for_model_ids(untracked)
end end
def finder # @return [Array] the list of IDs of deleted records
@finder ||= registry_class.finder_class.new(current_node_id: Gitlab::Geo.current_node.id) def delete_unused_in_range(delete_unused_in_range)
return [] if delete_unused_in_range.empty?
registry_class.delete_for_model_ids(delete_unused_in_range)
end end
# This hack is used to sync new files soon after they are created. # This hack is used to sync new files soon after they are created.
...@@ -65,7 +85,7 @@ module Geo ...@@ -65,7 +85,7 @@ module Geo
# This is not needed for replicables that have already implemented # This is not needed for replicables that have already implemented
# create events. # create events.
# #
# @param [Integer] the last ID of the batch processed in create_missing_in_range # @param [Integer] the last ID of the batch processed in create_untracked_in_range
# @return [Array] the list of IDs of created records # @return [Array] the list of IDs of created records
def create_missing_above(end_of_batch:) def create_missing_above(end_of_batch:)
return [] if registry_class.has_create_events? return [] if registry_class.has_create_events?
...@@ -81,7 +101,7 @@ module Geo ...@@ -81,7 +101,7 @@ module Geo
start = last_id - batch_size + 1 start = last_id - batch_size + 1
finish = last_id finish = last_id
create_missing_in_range(start..finish) handle_differences_in_range(start..finish)
end end
# Returns true when LoopingBatcher will soon return ranges near the end of # Returns true when LoopingBatcher will soon return ranges near the end of
...@@ -104,5 +124,18 @@ module Geo ...@@ -104,5 +124,18 @@ module Geo
} }
) )
end end
def log_deleted(range, unused, deleted)
log_info(
"Deleted registry entries",
{
registry_class: registry_class.name,
start: range.first,
finish: range.last,
deleted: deleted.length,
failed_to_delete: unused.length - deleted.length
}
)
end
end end
end end
...@@ -2,22 +2,25 @@ ...@@ -2,22 +2,25 @@
require 'spec_helper' require 'spec_helper'
describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_store_caching do describe Geo::RegistryConsistencyService, :geo, :use_clean_rails_memory_store_caching do
include EE::GeoHelpers include EE::GeoHelpers
let(:secondary) { create(:geo_node) } let(:secondary) { create(:geo_node) }
subject { described_class.new(registry_class, batch_size: batch_size) }
before do before do
stub_current_geo_node(secondary) stub_current_geo_node(secondary)
end end
::Geo::Secondary::RegistryConsistencyWorker::REGISTRY_CLASSES.each do |klass| ::Geo::Secondary::RegistryConsistencyWorker::REGISTRY_CLASSES.each do |klass|
let(:registry_class) { klass } let(:registry_class) { klass }
let(:registry_class_factory) { registry_class.underscore.tr('/', '_').to_sym }
let(:model_class) { registry_class::MODEL_CLASS } let(:model_class) { registry_class::MODEL_CLASS }
let(:model_class_factory) { model_class.underscore.tr('/', '_').to_sym }
let(:model_foreign_key) { registry_class::MODEL_FOREIGN_KEY }
let(:batch_size) { 2 } let(:batch_size) { 2 }
subject { described_class.new(registry_class, batch_size: batch_size) }
describe 'registry_class interface' do describe 'registry_class interface' do
it 'defines a MODEL_CLASS constant' do it 'defines a MODEL_CLASS constant' do
expect(registry_class::MODEL_CLASS).not_to be_nil expect(registry_class::MODEL_CLASS).not_to be_nil
...@@ -31,6 +34,10 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor ...@@ -31,6 +34,10 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
expect(registry_class).to respond_to(:insert_for_model_ids) expect(registry_class).to respond_to(:insert_for_model_ids)
end end
it 'responds to .delete_for_model_ids' do
expect(registry_class).to respond_to(:delete_for_model_ids)
end
it 'responds to .finder_class' do it 'responds to .finder_class' do
expect(registry_class).to respond_to(:finder_class) expect(registry_class).to respond_to(:finder_class)
end end
...@@ -42,7 +49,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor ...@@ -42,7 +49,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
describe '#execute' do describe '#execute' do
context 'when there are replicable records missing registries' do context 'when there are replicable records missing registries' do
let!(:expected_batch) { create_list(model_class.underscore.to_sym, batch_size) } let!(:expected_batch) { create_list(model_class_factory, batch_size) }
it 'creates missing registries' do it 'creates missing registries' do
expect do expect do
...@@ -55,7 +62,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor ...@@ -55,7 +62,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
end end
it 'does not exceed batch size' do it 'does not exceed batch size' do
not_expected = create(model_class.underscore.to_sym) not_expected = create(model_class_factory)
subject.execute subject.execute
...@@ -64,7 +71,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor ...@@ -64,7 +71,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
# Temporarily, until we implement create events for these replicables # Temporarily, until we implement create events for these replicables
context 'when the number of records is greater than 6 batches' do context 'when the number of records is greater than 6 batches' do
let!(:five_batches_worth) { create_list(model_class.underscore.to_sym, 5 * batch_size) } let!(:five_batches_worth) { create_list(model_class_factory, 5 * batch_size) }
context 'when the previous batch is greater than 5 batches from the end of the table' do context 'when the previous batch is greater than 5 batches from the end of the table' do
context 'when create events are implemented for this replicable' do context 'when create events are implemented for this replicable' do
...@@ -82,8 +89,8 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor ...@@ -82,8 +89,8 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
expect(registry_class.model_id_in(expected).count).to eq(2) expect(registry_class.model_id_in(expected).count).to eq(2)
end end
it 'calls #create_missing_in_range only once' do it 'calls #handle_differences_in_range only once' do
expect(subject).to receive(:create_missing_in_range).once.and_call_original expect(subject).to receive(:handle_differences_in_range).once.and_call_original
subject.execute subject.execute
end end
...@@ -104,8 +111,8 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor ...@@ -104,8 +111,8 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
expect(registry_class.model_id_in(expected).count).to eq(4) expect(registry_class.model_id_in(expected).count).to eq(4)
end end
it 'calls #create_missing_in_range twice' do it 'calls #handle_differences_in_range twice' do
expect(subject).to receive(:create_missing_in_range).twice.and_call_original expect(subject).to receive(:handle_differences_in_range).twice.and_call_original
subject.execute subject.execute
end end
...@@ -124,8 +131,8 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor ...@@ -124,8 +131,8 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
end.to change { registry_class.count }.by(batch_size) end.to change { registry_class.count }.by(batch_size)
end end
it 'calls #create_missing_in_range once' do it 'calls #handle_differences_in_range once' do
expect(subject).to receive(:create_missing_in_range).once.and_call_original expect(subject).to receive(:handle_differences_in_range).once.and_call_original
subject.execute subject.execute
end end
...@@ -133,17 +140,42 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor ...@@ -133,17 +140,42 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
end end
context 'when the number of records is less than 6 batches' do context 'when the number of records is less than 6 batches' do
it 'calls #create_missing_in_range once' do it 'calls #handle_differences_in_range once' do
expect(subject).to receive(:create_missing_in_range).once.and_call_original expect(subject).to receive(:handle_differences_in_range).once.and_call_original
subject.execute subject.execute
end end
end end
end end
context 'when there are unused registries' do
let(:records) { create_list(model_class_factory, batch_size) }
let(:unused_registry_ids) { [records.first].map(&:id) }
let!(:registries) do
records.map do |record|
create(registry_class_factory, model_foreign_key => record.id)
end
end
before do
model_class.where(id: unused_registry_ids).delete_all
end
it 'deletes unused registries', :sidekiq_inline do
subject.execute
expect(registry_class.where(model_foreign_key => unused_registry_ids)).to be_empty
end
it 'returns truthy' do
expect(subject.execute).to be_truthy
end
end
context 'when all replicable records have registries' do context 'when all replicable records have registries' do
it 'does nothing' do it 'does nothing' do
create_list(model_class.underscore.to_sym, batch_size) create_list(model_class_factory, batch_size)
subject.execute # create the missing registries subject.execute # create the missing registries
...@@ -153,7 +185,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor ...@@ -153,7 +185,7 @@ describe Geo::RegistryConsistencyService, :geo_fdw, :use_clean_rails_memory_stor
end end
it 'returns falsey' do it 'returns falsey' do
create_list(model_class.underscore.to_sym, batch_size) create_list(model_class_factory, batch_size)
subject.execute # create the missing registries subject.execute # create the missing registries
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment