Commit edb5cac4 authored by Michael Kozono's avatar Michael Kozono

Use bulk inserts

parent 17ce21d7
...@@ -20,7 +20,19 @@ module Gitlab ...@@ -20,7 +20,19 @@ module Gitlab
def perform def perform
ensure_temporary_tracking_table_exists ensure_temporary_tracking_table_exists
# Since Postgres < 9.5 does not have ON CONFLICT DO NOTHING, and since
# doing inserts-if-not-exists without ON CONFLICT DO NOTHING would be
# slow, start with an empty table for Postgres < 9.5.
# That way we can do bulk inserts at ~30x the speed of individual
# inserts (~20 minutes worth of inserts at GitLab.com scale instead of
# ~10 hours).
# In all other cases, installations will get both bulk inserts and the
# ability for these jobs to retry without having to clear and reinsert.
clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates?
store_untracked_file_paths store_untracked_file_paths
schedule_populate_untracked_uploads_jobs schedule_populate_untracked_uploads_jobs
end end
...@@ -44,6 +56,10 @@ module Gitlab ...@@ -44,6 +56,10 @@ module Gitlab
end end
end end
def clear_untracked_file_paths
UntrackedFile.delete_all
end
def store_untracked_file_paths def store_untracked_file_paths
return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR) return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR)
...@@ -96,36 +112,35 @@ module Gitlab ...@@ -96,36 +112,35 @@ module Gitlab
end end
def insert_file_paths(file_paths) def insert_file_paths(file_paths)
ActiveRecord::Base.transaction do sql = if postgresql_pre_9_5?
file_paths.each do |file_path| "INSERT INTO #{table_columns_and_values_for_insert(file_paths)};"
insert_file_path(file_path) elsif postgresql?
end "INSERT INTO #{table_columns_and_values_for_insert(file_paths)} ON CONFLICT DO NOTHING;"
end else # MySQL
"INSERT IGNORE INTO #{table_columns_and_values_for_insert(file_paths)};"
end end
def insert_file_path(file_path) ActiveRecord::Base.connection.execute(sql)
if postgresql_pre_9_5?
# No easy way to do ON CONFLICT DO NOTHING before Postgres 9.5 so just use Rails
return UntrackedFile.where(path: file_path).first_or_create
end end
table_columns_and_values = 'untracked_files_for_uploads (path, created_at, updated_at) VALUES (?, ?, ?)' def table_columns_and_values_for_insert(file_paths)
timestamp = Time.now.utc.iso8601
sql = if postgresql? values = file_paths.map do |file_path|
"INSERT INTO #{table_columns_and_values} ON CONFLICT DO NOTHING;" ActiveRecord::Base.send(:sanitize_sql_array, ['(?, ?, ?)', file_path, timestamp, timestamp]) # rubocop:disable GitlabSecurity/PublicSend
else end.join(', ')
"INSERT IGNORE INTO #{table_columns_and_values};"
end
timestamp = Time.now.utc.iso8601 "#{UntrackedFile.table_name} (path, created_at, updated_at) VALUES #{values}"
sql = ActiveRecord::Base.send(:sanitize_sql_array, [sql, file_path, timestamp, timestamp]) # rubocop:disable GitlabSecurity/PublicSend
ActiveRecord::Base.connection.execute(sql)
end end
def postgresql? def postgresql?
@postgresql ||= Gitlab::Database.postgresql? @postgresql ||= Gitlab::Database.postgresql?
end end
def can_bulk_insert_and_ignore_duplicates?
!postgresql_pre_9_5?
end
def postgresql_pre_9_5? def postgresql_pre_9_5?
@postgresql_pre_9_5 ||= postgresql? && @postgresql_pre_9_5 ||= postgresql? &&
ActiveRecord::Base.connection.select_value('SHOW server_version_num').to_i < 90500 ActiveRecord::Base.connection.select_value('SHOW server_version_num').to_i < 90500
......
...@@ -53,6 +53,15 @@ describe Gitlab::BackgroundMigration::PrepareUntrackedUploads, :migration, :side ...@@ -53,6 +53,15 @@ describe Gitlab::BackgroundMigration::PrepareUntrackedUploads, :migration, :side
expect(record.reload.path.size).to eq(519) expect(record.reload.path.size).to eq(519)
end end
context "test bulk insert with ON CONFLICT DO NOTHING or IGNORE" do
around do |example|
# If this is CI, we use Postgres 9.2 so this whole context should be
# skipped since we're unable to use ON CONFLICT DO NOTHING or IGNORE.
if described_class.new.send(:can_bulk_insert_and_ignore_duplicates?)
example.run
end
end
context 'when files were uploaded before and after hashed storage was enabled' do context 'when files were uploaded before and after hashed storage was enabled' do
let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) } let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) }
let!(:user) { create(:user, :with_avatar) } let!(:user) { create(:user, :with_avatar) }
...@@ -129,6 +138,95 @@ describe Gitlab::BackgroundMigration::PrepareUntrackedUploads, :migration, :side ...@@ -129,6 +138,95 @@ describe Gitlab::BackgroundMigration::PrepareUntrackedUploads, :migration, :side
end end
end end
end end
end
context 'test bulk insert without ON CONFLICT DO NOTHING or IGNORE' do
before do
# If this is CI, we use Postgres 9.2 so this stub has no effect.
#
# If this is being run on Postgres 9.5+ or MySQL, then this stub allows us
# to test the bulk insert functionality without ON CONFLICT DO NOTHING or
# IGNORE.
allow_any_instance_of(described_class).to receive(:postgresql_pre_9_5?).and_return(true)
end
context 'when files were uploaded before and after hashed storage was enabled' do
let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) }
let!(:user) { create(:user, :with_avatar) }
let!(:project1) { create(:project, :with_avatar) }
let(:project2) { create(:project) } # instantiate after enabling hashed_storage
before do
# Markdown upload before enabling hashed_storage
UploadService.new(project1, uploaded_file, FileUploader).execute
stub_application_setting(hashed_storage_enabled: true)
# Markdown upload after enabling hashed_storage
UploadService.new(project2, uploaded_file, FileUploader).execute
end
it 'adds unhashed files to the untracked_files_for_uploads table' do
described_class.new.perform
expect(untracked_files_for_uploads.count).to eq(5)
end
it 'adds files with paths relative to CarrierWave.root' do
described_class.new.perform
untracked_files_for_uploads.all.each do |file|
expect(file.path.start_with?('uploads/')).to be_truthy
end
end
it 'does not add hashed files to the untracked_files_for_uploads table' do
described_class.new.perform
hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path
expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey
end
it 'correctly schedules the follow-up background migration jobs' do
described_class.new.perform
expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5)
expect(BackgroundMigrationWorker.jobs.size).to eq(1)
end
# E.g. from a previous failed run of this background migration
context 'when there is existing data in untracked_files_for_uploads' do
before do
described_class.new.perform
end
it 'does not error or produce duplicates of existing data' do
expect do
described_class.new.perform
end.not_to change { untracked_files_for_uploads.count }.from(5)
end
end
# E.g. The installation is in use at the time of migration, and someone has
# just uploaded a file
context 'when there are files in /uploads/tmp' do
let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') }
before do
FileUtils.touch(tmp_file)
end
after do
FileUtils.rm(tmp_file)
end
it 'does not add files from /uploads/tmp' do
described_class.new.perform
expect(untracked_files_for_uploads.count).to eq(5)
end
end
end
end
# Very new or lightly-used installations that are running this migration # Very new or lightly-used installations that are running this migration
# may not have an upload directory because they have no uploads. # may not have an upload directory because they have no uploads.
......
...@@ -62,8 +62,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do ...@@ -62,8 +62,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do
expect(appearance.reload.uploads.where("path like '%/header_logo/%'").first.attributes).to include(@appearance_header_logo_attributes) expect(appearance.reload.uploads.where("path like '%/header_logo/%'").first.attributes).to include(@appearance_header_logo_attributes)
expect(user2.reload.uploads.first.attributes).to include(@user2_avatar_attributes) expect(user2.reload.uploads.first.attributes).to include(@user2_avatar_attributes)
expect(project2.reload.uploads.first.attributes).to include(@project2_avatar_attributes) expect(project2.reload.uploads.where(uploader: 'AvatarUploader').first.attributes).to include(@project2_avatar_attributes)
expect(project2.uploads.last.attributes).to include(@project2_markdown_attributes) expect(project2.uploads.where(uploader: 'FileUploader').first.attributes).to include(@project2_markdown_attributes)
end end
it 'ignores already-tracked uploads' do it 'ignores already-tracked uploads' do
...@@ -71,8 +71,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do ...@@ -71,8 +71,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do
expect(appearance.reload.uploads.where("path like '%/logo/%'").first.attributes).to include(@appearance_logo_attributes) expect(appearance.reload.uploads.where("path like '%/logo/%'").first.attributes).to include(@appearance_logo_attributes)
expect(user1.reload.uploads.first.attributes).to include(@user1_avatar_attributes) expect(user1.reload.uploads.first.attributes).to include(@user1_avatar_attributes)
expect(project1.reload.uploads.first.attributes).to include(@project1_avatar_attributes) expect(project1.reload.uploads.where(uploader: 'AvatarUploader').first.attributes).to include(@project1_avatar_attributes)
expect(project1.uploads.last.attributes).to include(@project1_markdown_attributes) expect(project1.uploads.where(uploader: 'FileUploader').first.attributes).to include(@project1_markdown_attributes)
end end
it 'the temporary table untracked_files_for_uploads no longer exists' do it 'the temporary table untracked_files_for_uploads no longer exists' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment