Commit 1d475ae1 authored by Douglas Barbosa Alexandre's avatar Douglas Barbosa Alexandre Committed by Nick Thomas

Refactor query to retrieve attachments when selective sync is enabled

We have a couple of places that select namespaces.* where
select namespaces.id would do the trick. This commit applies
the same concept that we have for projects to namespaces.
parent 57cf0012
...@@ -45,10 +45,10 @@ module Geo ...@@ -45,10 +45,10 @@ module Geo
# rubocop: disable CodeReuse/ActiveRecord # rubocop: disable CodeReuse/ActiveRecord
def find_unsynced(batch_size:, except_file_ids: []) def find_unsynced(batch_size:, except_file_ids: [])
relation = relation =
if use_legacy_queries? if use_legacy_queries_for_selective_sync?
legacy_find_unsynced(except_file_ids: except_file_ids) legacy_finder.attachments_unsynced(except_file_ids: except_file_ids)
else else
fdw_find_unsynced(except_file_ids: except_file_ids) attachments_unsynced(except_file_ids: except_file_ids)
end end
relation.limit(batch_size) relation.limit(batch_size)
...@@ -58,10 +58,10 @@ module Geo ...@@ -58,10 +58,10 @@ module Geo
# rubocop: disable CodeReuse/ActiveRecord # rubocop: disable CodeReuse/ActiveRecord
def find_migrated_local(batch_size:, except_file_ids: []) def find_migrated_local(batch_size:, except_file_ids: [])
relation = relation =
if use_legacy_queries? if use_legacy_queries_for_selective_sync?
legacy_find_migrated_local(except_file_ids: except_file_ids) legacy_finder.attachments_migrated_local(except_file_ids: except_file_ids)
else else
fdw_find_migrated_local(except_file_ids: except_file_ids) attachments_migrated_local(except_file_ids: except_file_ids)
end end
relation.limit(batch_size) relation.limit(batch_size)
...@@ -113,6 +113,10 @@ module Geo ...@@ -113,6 +113,10 @@ module Geo
end end
end end
def attachments
fdw_geo_node.attachments
end
def attachments_synced def attachments_synced
if use_legacy_queries_for_selective_sync? if use_legacy_queries_for_selective_sync?
legacy_finder.attachments_synced legacy_finder.attachments_synced
...@@ -123,6 +127,21 @@ module Geo ...@@ -123,6 +127,21 @@ module Geo
end end
end end
def attachments_migrated_local(except_file_ids:)
attachments
.inner_join_file_registry
.with_files_stored_remotely
.merge(Geo::FileRegistry.attachments)
.id_not_in(except_file_ids)
end
def attachments_unsynced(except_file_ids:)
attachments
.missing_file_registry
.syncable
.id_not_in(except_file_ids)
end
def attachments_failed def attachments_failed
if use_legacy_queries_for_selective_sync? if use_legacy_queries_for_selective_sync?
legacy_finder.attachments_failed legacy_finder.attachments_failed
...@@ -143,90 +162,5 @@ module Geo ...@@ -143,90 +162,5 @@ module Geo
.merge(Geo::FileRegistry.missing_on_primary) .merge(Geo::FileRegistry.missing_on_primary)
end end
end end
# rubocop: disable CodeReuse/ActiveRecord
def attachments
if selective_sync?
Geo::Fdw::Upload.where(group_uploads.or(project_uploads).or(other_uploads))
else
Geo::Fdw::Upload.all
end
end
# rubocop: enable CodeReuse/ActiveRecord
# rubocop: disable CodeReuse/ActiveRecord
def group_uploads
namespace_ids =
if current_node.selective_sync_by_namespaces?
Gitlab::ObjectHierarchy.new(fdw_geo_node.namespaces).base_and_descendants.select(:id)
elsif current_node.selective_sync_by_shards?
leaf_groups = Geo::Fdw::Namespace.where(id: fdw_geo_node.projects.select(:namespace_id))
Gitlab::ObjectHierarchy.new(leaf_groups).base_and_ancestors.select(:id)
else
Namespace.none
end
# This query was intentionally converted to a raw one to get it work in Rails 5.0.
# In Rails 5.0 and 5.1 there's a bug: https://github.com/rails/arel/issues/531
# Please convert it back when on rails 5.2 as it works again as expected since 5.2.
namespace_ids_in_sql = Arel::Nodes::SqlLiteral.new("#{fdw_upload_table.name}.#{fdw_upload_table[:model_id].name} IN (#{namespace_ids.to_sql})")
fdw_upload_table[:model_type].eq('Namespace').and(namespace_ids_in_sql)
end
# rubocop: enable CodeReuse/ActiveRecord
def project_uploads
project_ids = fdw_geo_node.projects.select(:id)
# This query was intentionally converted to a raw one to get it work in Rails 5.0.
# In Rails 5.0 and 5.1 there's a bug: https://github.com/rails/arel/issues/531
# Please convert it back when on rails 5.2 as it works again as expected since 5.2.
project_ids_in_sql = Arel::Nodes::SqlLiteral.new("#{fdw_upload_table.name}.#{fdw_upload_table[:model_id].name} IN (#{project_ids.to_sql})")
fdw_upload_table[:model_type].eq('Project').and(project_ids_in_sql)
end
def other_uploads
fdw_upload_table[:model_type].not_in(%w[Namespace Project])
end
def fdw_upload_table
Geo::Fdw::Upload.arel_table
end
def fdw_find_unsynced(except_file_ids:)
attachments
.missing_file_registry
.syncable
.id_not_in(except_file_ids)
end
def fdw_find_migrated_local(except_file_ids:)
attachments
.inner_join_file_registry
.with_files_stored_remotely
.merge(Geo::FileRegistry.attachments)
.id_not_in(except_file_ids)
end
def legacy_find_unsynced(except_file_ids:)
registry_file_ids = Geo::FileRegistry.attachments.pluck_file_key | except_file_ids
legacy_left_outer_join_registry_ids(
syncable,
registry_file_ids,
Upload
)
end
def legacy_find_migrated_local(except_file_ids:)
registry_file_ids = Geo::FileRegistry.attachments.pluck_file_key - except_file_ids
legacy_inner_join_registry_ids(
legacy_finder.attachments.with_files_stored_remotely,
registry_file_ids,
Upload
)
end
end end
end end
...@@ -6,16 +6,6 @@ module Geo ...@@ -6,16 +6,6 @@ module Geo
attachments.syncable attachments.syncable
end end
# rubocop:disable CodeReuse/ActiveRecord
def attachments
if selective_sync?
Upload.where(group_uploads.or(project_uploads).or(other_uploads))
else
Upload.all
end
end
# rubocop:enable CodeReuse/ActiveRecord
def attachments_synced def attachments_synced
legacy_inner_join_registry_ids( legacy_inner_join_registry_ids(
syncable, syncable,
...@@ -24,6 +14,26 @@ module Geo ...@@ -24,6 +14,26 @@ module Geo
) )
end end
def attachments_migrated_local(except_file_ids:)
registry_file_ids = Geo::FileRegistry.attachments.pluck_file_key - except_file_ids
legacy_inner_join_registry_ids(
attachments.with_files_stored_remotely,
registry_file_ids,
Upload
)
end
def attachments_unsynced(except_file_ids:)
registry_file_ids = Geo::FileRegistry.attachments.pluck_file_key | except_file_ids
legacy_left_outer_join_registry_ids(
syncable,
registry_file_ids,
Upload
)
end
def attachments_failed def attachments_failed
legacy_inner_join_registry_ids( legacy_inner_join_registry_ids(
syncable, syncable,
...@@ -53,44 +63,8 @@ module Geo ...@@ -53,44 +63,8 @@ module Geo
private private
# rubocop:disable CodeReuse/ActiveRecord def attachments
def group_uploads current_node.attachments
namespace_ids =
if current_node.selective_sync_by_namespaces?
Gitlab::ObjectHierarchy.new(current_node.namespaces).base_and_descendants.select(:id)
elsif current_node.selective_sync_by_shards?
leaf_groups = Namespace.where(id: current_node.projects.select(:namespace_id))
Gitlab::ObjectHierarchy.new(leaf_groups).base_and_ancestors.select(:id)
else
Namespace.none
end
# This query was intentionally converted to a raw one to get it work in Rails 5.0.
# In Rails 5.0 and 5.1 there's a bug: https://github.com/rails/arel/issues/531
# Please convert it back when on rails 5.2 as it works again as expected since 5.2.
namespace_ids_in_sql = Arel::Nodes::SqlLiteral.new("#{upload_table.name}.#{upload_table[:model_id].name} IN (#{namespace_ids.to_sql})")
upload_table[:model_type].eq('Namespace').and(namespace_ids_in_sql)
end
# rubocop:enable CodeReuse/ActiveRecord
def project_uploads
project_ids = current_node.projects.select(:id)
# This query was intentionally converted to a raw one to get it work in Rails 5.0.
# In Rails 5.0 and 5.1 there's a bug: https://github.com/rails/arel/issues/531
# Please convert it back when on rails 5.2 as it works again as expected since 5.2.
project_ids_in_sql = Arel::Nodes::SqlLiteral.new("#{upload_table.name}.#{upload_table[:model_id].name} IN (#{project_ids.to_sql})")
upload_table[:model_type].eq('Project').and(project_ids_in_sql)
end
def other_uploads
upload_table[:model_type].not_in(%w[Namespace Project])
end
def upload_table
Upload.arel_table
end end
end end
end end
...@@ -3,6 +3,14 @@ ...@@ -3,6 +3,14 @@
module Geo::SelectiveSync module Geo::SelectiveSync
extend ActiveSupport::Concern extend ActiveSupport::Concern
def attachments
if selective_sync?
uploads_model.where(group_attachments.or(project_attachments).or(other_attachments))
else
uploads_model.all
end
end
def selective_sync? def selective_sync?
selective_sync_type.present? selective_sync_type.present?
end end
...@@ -19,8 +27,7 @@ module Geo::SelectiveSync ...@@ -19,8 +27,7 @@ module Geo::SelectiveSync
def selected_namespaces_and_descendants def selected_namespaces_and_descendants
relation = selected_namespaces_and_descendants_cte.apply_to(namespaces_model.all) relation = selected_namespaces_and_descendants_cte.apply_to(namespaces_model.all)
relation.extend(Gitlab::Database::ReadOnlyRelation) read_only(relation)
relation
end end
def selected_namespaces_and_descendants_cte def selected_namespaces_and_descendants_cte
...@@ -40,6 +47,80 @@ module Geo::SelectiveSync ...@@ -40,6 +47,80 @@ module Geo::SelectiveSync
cte cte
end end
def selected_leaf_namespaces_and_ancestors
relation = selected_leaf_namespaces_and_ancestors_cte.apply_to(namespaces_model.all)
read_only(relation)
end
# Returns a CTE selecting namespace IDs for selected shards
#
# When we need to sync resources that are only associated with namespaces,
# but the instance is selectively syncing by shard, we must sync every
# namespace of every project in those shards. We must also sync every
# ancestor of those namespaces.
def selected_leaf_namespaces_and_ancestors_cte
cte = Gitlab::SQL::RecursiveCTE.new(:base_and_ancestors)
cte << namespaces_model
.select(namespaces_table[:id], namespaces_table[:parent_id])
.where(id: projects.select(:namespace_id))
# Recursively get all the ancestors of the base set.
cte << namespaces_model
.select(namespaces_table[:id], namespaces_table[:parent_id])
.from([namespaces_table, cte.table])
.where(namespaces_table[:id].eq(cte.table[:parent_id]))
.except(:order)
cte
end
def read_only(relation)
# relations using a CTE are not safe to use with update_all as it will
# throw away the CTE, hence we mark them as read-only.
relation.extend(Gitlab::Database::ReadOnlyRelation)
relation
end
def group_attachments
namespaces =
if selective_sync_by_namespaces?
selected_namespaces_and_descendants
elsif selective_sync_by_shards?
selected_leaf_namespaces_and_ancestors
else
namespaces_model.none
end
attachments_for_model_type_with_id_in('Namespace', namespaces.select(:id))
end
def project_attachments
attachments_for_model_type_with_id_in('Project', projects.select(:id))
end
def other_attachments
uploads_table[:model_type].not_in(%w[Namespace Project])
end
def attachments_for_model_type_with_id_in(model_type, model_ids)
# This query was intentionally converted to a raw one to get it work in Rails 5.0.
# In Rails 5.0 and 5.1 there's a bug: https://github.com/rails/arel/issues/531
# Please convert it back when on rails 5.2 as it works again as expected since 5.2.
column_name = "#{uploads_table.name}.#{uploads_table[:model_id].name}"
raw_sql = Arel::Nodes::SqlLiteral.new("#{column_name} IN (#{model_ids.to_sql})")
uploads_table[:model_type].eq(model_type).and(raw_sql)
end
# This concern doesn't define a geo_node_namespace_links relation. That's
# done in ::GeoNode or ::Geo::Fdw::GeoNode respectively. So when we use the
# same code from the two places, they act differently - the first doesn't
# use FDW, the second does.
def geo_node_namespace_links_table
geo_node_namespace_links.arel_table
end
# This concern doesn't define a namespaces relation. That's done in ::GeoNode # This concern doesn't define a namespaces relation. That's done in ::GeoNode
# or ::Geo::Fdw::GeoNode respectively. So when we use the same code from the # or ::Geo::Fdw::GeoNode respectively. So when we use the same code from the
# two places, they act differently - the first doesn't use FDW, the second does. # two places, they act differently - the first doesn't use FDW, the second does.
...@@ -54,11 +135,13 @@ module Geo::SelectiveSync ...@@ -54,11 +135,13 @@ module Geo::SelectiveSync
namespaces.arel_table namespaces.arel_table
end end
# This concern doesn't define a geo_node_namespace_links relation. That's def uploads_model
# done in ::GeoNode or ::Geo::Fdw::GeoNode respectively. So when we use the raise NotImplementedError,
# same code from the two places, they act differently - the first doesn't "#{self.class} does not implement #{__method__}"
# use FDW, the second does. end
def geo_node_namespace_links_table
geo_node_namespace_links.arel_table def uploads_table
raise NotImplementedError,
"#{self.class} does not implement #{__method__}"
end end
end end
...@@ -71,6 +71,14 @@ module Geo ...@@ -71,6 +71,14 @@ module Geo
Gitlab::Geo::Fdw::ProjectRegistryQueryBuilder.new Gitlab::Geo::Fdw::ProjectRegistryQueryBuilder.new
.within_shards(selective_sync_shards) .within_shards(selective_sync_shards)
end end
def uploads_model
Geo::Fdw::Upload
end
def uploads_table
Geo::Fdw::Upload.arel_table
end
end end
end end
end end
...@@ -322,4 +322,12 @@ class GeoNode < ApplicationRecord ...@@ -322,4 +322,12 @@ class GeoNode < ApplicationRecord
def projects_for_selected_shards def projects_for_selected_shards
Project.within_shards(selective_sync_shards) Project.within_shards(selective_sync_shards)
end end
def uploads_model
Upload
end
def uploads_table
Upload.arel_table
end
end end
---
title: Geo - Implement selective sync support for the FDW queries to find attachments
merge_request: 11544
author:
type: changed
...@@ -49,64 +49,100 @@ describe Geo::AttachmentRegistryFinder, :geo do ...@@ -49,64 +49,100 @@ describe Geo::AttachmentRegistryFinder, :geo do
shared_examples 'finds all the things' do shared_examples 'finds all the things' do
describe '#find_unsynced' do describe '#find_unsynced' do
it 'returns uploads without an entry on the tracking database' do let!(:upload_1) { create(:upload, model: synced_group) }
let!(:upload_2) { create(:upload, model: unsynced_group) }
let!(:upload_3) { create(:upload, :issuable_upload, model: synced_project) }
let!(:upload_4) { create(:upload, model: unsynced_project) }
let!(:upload_5) { create(:upload, model: synced_project) }
let!(:upload_remote_1) { create(:upload, :object_storage, model: synced_project) }
it 'returns attachments without an entry on the tracking database' do
create(:geo_file_registry, :avatar, file_id: upload_1.id) create(:geo_file_registry, :avatar, file_id: upload_1.id)
uploads = subject.find_unsynced(batch_size: 10) attachments = subject.find_unsynced(batch_size: 10, except_file_ids: [upload_2.id])
expect(uploads).to match_ids(upload_2, upload_3, upload_4) expect(attachments).to match_ids(upload_3, upload_4, upload_5)
end end
it 'excludes uploads in the except_file_ids option' do context 'with selective sync by namespace' do
uploads = subject.find_unsynced(batch_size: 10, except_file_ids: [upload_2.id]) before do
secondary.update!(selective_sync_type: 'namespaces', namespaces: [synced_group])
end
it 'returns attachments without an entry on the tracking database' do
create(:geo_file_registry, :avatar, file_id: upload_1.id)
expect(uploads).to match_ids(upload_1, upload_3, upload_4) attachments = subject.find_unsynced(batch_size: 10, except_file_ids: [upload_5.id])
expect(attachments).to match_ids(upload_3)
end
end end
it 'excludes remote uploads' do context 'with selective sync by shard' do
upload_1.update!(store: ObjectStorage::Store::REMOTE) before do
secondary.update!(selective_sync_type: 'shards', selective_sync_shards: ['broken'])
end
it 'returns attachments without an entry on the tracking database' do
create(:geo_file_registry, :avatar, file_id: upload_2.id)
uploads = subject.find_unsynced(batch_size: 10) attachments = subject.find_unsynced(batch_size: 10)
expect(uploads).to match_ids(upload_2, upload_3, upload_4) expect(attachments).to match_ids(upload_4)
end
end end
end end
describe '#find_migrated_local' do describe '#find_migrated_local' do
it 'returns uploads stored remotely and successfully synced locally' do let!(:upload_1) { create(:upload, model: synced_group) }
upload = create(:upload, :object_storage, model: synced_group) let!(:upload_2) { create(:upload, model: unsynced_group) }
create(:geo_file_registry, :avatar, file_id: upload.id) let!(:upload_3) { create(:upload, :issuable_upload, model: synced_project) }
let!(:upload_4) { create(:upload, model: unsynced_project) }
let!(:upload_5) { create(:upload, model: synced_project) }
let!(:upload_remote_1) { create(:upload, :object_storage, model: synced_project) }
let!(:upload_remote_2) { create(:upload, :object_storage, model: unsynced_project) }
uploads = subject.find_migrated_local(batch_size: 100) before do
create(:geo_file_registry, :avatar, file_id: upload_remote_1.id)
create(:geo_file_registry, :avatar, file_id: upload_remote_2.id)
end
it 'returns attachments stored remotely and successfully synced locally' do
attachments = subject.find_migrated_local(batch_size: 100, except_file_ids: [upload_remote_2.id])
expect(uploads).to match_ids(upload) expect(attachments).to match_ids(upload_remote_1)
end end
it 'excludes uploads stored remotely, but not synced yet' do it 'excludes attachments stored remotely, but not synced yet' do
create(:upload, :object_storage, model: synced_group) create(:upload, :object_storage, model: synced_group)
uploads = subject.find_migrated_local(batch_size: 100) attachments = subject.find_migrated_local(batch_size: 100)
expect(uploads).to be_empty expect(attachments).to match_ids(upload_remote_1, upload_remote_2)
end end
it 'excludes synced uploads that are stored locally' do context 'with selective sync by namespace' do
create(:geo_file_registry, :avatar, file_id: upload_5.id) before do
secondary.update!(selective_sync_type: 'namespaces', namespaces: [synced_group])
end
uploads = subject.find_migrated_local(batch_size: 100) it 'returns attachments stored remotely and successfully synced locally' do
attachments = subject.find_migrated_local(batch_size: 10)
expect(uploads).to be_empty expect(attachments).to match_ids(upload_remote_1)
end
end end
it 'excludes except_file_ids' do context 'with selective sync by shard' do
upload_a = create(:upload, :object_storage, model: synced_group) before do
upload_b = create(:upload, :object_storage, model: unsynced_group) secondary.update!(selective_sync_type: 'shards', selective_sync_shards: ['broken'])
create(:geo_file_registry, :avatar, file_id: upload_a.id) end
create(:geo_file_registry, :avatar, file_id: upload_b.id)
uploads = subject.find_migrated_local(batch_size: 10, except_file_ids: [upload_a.id]) it 'returns attachments stored remotely and successfully synced locally' do
attachments = subject.find_migrated_local(batch_size: 10)
expect(uploads).to match_ids(upload_b) expect(attachments).to match_ids(upload_remote_2)
end
end end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment