Commit 7ca18a9b authored by Andreas Brandl's avatar Andreas Brandl

Merge branch '35749-improve-performance-of-lfs_object-queries' into 'master'

Move batching  from AR query to oids array

See merge request gitlab-org/gitlab!19709
parents 663d27e8 de145876
......@@ -18,6 +18,11 @@ class LfsObject < ApplicationRecord
after_save :update_file_store, if: :saved_change_to_file?
def self.not_linked_to_project(project)
where('NOT EXISTS (?)',
project.lfs_objects_projects.select(1).where('lfs_objects_projects.lfs_object_id = lfs_objects.id'))
end
def update_file_store
# The file.object_store is set during `uploader.store!`
# which happens after object is inserted/updated
......
......@@ -4,6 +4,9 @@
module Projects
module LfsPointers
class LfsLinkService < BaseService
TooManyOidsError = Class.new(StandardError)
MAX_OIDS = 100_000
BATCH_SIZE = 1000
# Accept an array of oids to link
......@@ -12,6 +15,10 @@ module Projects
def execute(oids)
return [] unless project&.lfs_enabled?
if oids.size > MAX_OIDS
raise TooManyOidsError, 'Too many LFS object ids to link, please push them manually'
end
# Search and link existing LFS Object
link_existing_lfs_objects(oids)
end
......@@ -20,22 +27,27 @@ module Projects
# rubocop: disable CodeReuse/ActiveRecord
def link_existing_lfs_objects(oids)
all_existing_objects = []
linked_existing_objects = []
iterations = 0
LfsObject.where(oid: oids).each_batch(of: BATCH_SIZE) do |existent_lfs_objects|
oids.each_slice(BATCH_SIZE) do |oids_batch|
# Load all existing LFS Objects immediately so we don't issue an extra
# query for the `.any?`
existent_lfs_objects = LfsObject.where(oid: oids_batch).load
next unless existent_lfs_objects.any?
rows = existent_lfs_objects
.not_linked_to_project(project)
.map { |existing_lfs_object| { project_id: project.id, lfs_object_id: existing_lfs_object.id } }
Gitlab::Database.bulk_insert(:lfs_objects_projects, rows)
iterations += 1
not_linked_lfs_objects = existent_lfs_objects.where.not(id: project.all_lfs_objects)
project.all_lfs_objects << not_linked_lfs_objects
all_existing_objects += existent_lfs_objects.pluck(:oid)
linked_existing_objects += existent_lfs_objects.map(&:oid)
end
log_lfs_link_results(all_existing_objects.count, iterations)
log_lfs_link_results(linked_existing_objects.count, iterations)
all_existing_objects
linked_existing_objects
end
# rubocop: enable CodeReuse/ActiveRecord
......
---
title: Improve performance of linking LFS objects during import
merge_request: 19709
author:
type: performance
......@@ -3,6 +3,18 @@
require 'spec_helper'
describe LfsObject do
context 'scopes' do
describe '.not_existing_in_project' do
it 'contains only lfs objects not linked to the project' do
project = create(:project)
create(:lfs_objects_project, project: project)
other_lfs_object = create(:lfs_object)
expect(described_class.not_linked_to_project(project)).to contain_exactly(other_lfs_object)
end
end
end
it 'has a distinct has_many :projects relation through lfs_objects_projects' do
lfs_object = create(:lfs_object)
project = create(:project)
......
......@@ -16,6 +16,13 @@ describe Projects::LfsPointers::LfsLinkService do
end
describe '#execute' do
it 'raises an error when trying to link too many objects at once' do
oids = Array.new(described_class::MAX_OIDS) { |i| "oid-#{i}" }
oids << 'the straw'
expect { subject.execute(oids) }.to raise_error(described_class::TooManyOidsError)
end
it 'links existing lfs objects to the project' do
expect(project.all_lfs_objects.count).to eq 2
......@@ -28,7 +35,7 @@ describe Projects::LfsPointers::LfsLinkService do
it 'returns linked oids' do
linked = lfs_objects_project.map(&:lfs_object).map(&:oid) << new_lfs_object.oid
expect(subject.execute(new_oid_list.keys)).to eq linked
expect(subject.execute(new_oid_list.keys)).to contain_exactly(*linked)
end
it 'links in batches' do
......@@ -48,5 +55,26 @@ describe Projects::LfsPointers::LfsLinkService do
expect(project.all_lfs_objects.count).to eq 9
expect(linked.size).to eq 7
end
it 'only queries for the batch that will be processed', :aggregate_failures do
stub_const("#{described_class}::BATCH_SIZE", 1)
oids = %w(one two)
expect(LfsObject).to receive(:where).with(oid: %w(one)).once.and_call_original
expect(LfsObject).to receive(:where).with(oid: %w(two)).once.and_call_original
subject.execute(oids)
end
it 'only queries 3 times' do
# make sure that we don't count the queries in the setup
new_oid_list
# These are repeated for each batch of oids: maximum (MAX_OIDS / BATCH_SIZE) times
# 1. Load the batch of lfs object ids that we might know already
# 2. Load the objects that have not been linked to the project yet
# 3. Insert the lfs_objects_projects for that batch
expect { subject.execute(new_oid_list.keys) }.not_to exceed_query_limit(3)
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment