Commit edb5cac4 authored by Michael Kozono's avatar Michael Kozono

Use bulk inserts

parent 17ce21d7
...@@ -20,7 +20,19 @@ module Gitlab ...@@ -20,7 +20,19 @@ module Gitlab
def perform def perform
ensure_temporary_tracking_table_exists ensure_temporary_tracking_table_exists
# Since Postgres < 9.5 does not have ON CONFLICT DO NOTHING, and since
# doing inserts-if-not-exists without ON CONFLICT DO NOTHING would be
# slow, start with an empty table for Postgres < 9.5.
# That way we can do bulk inserts at ~30x the speed of individual
# inserts (~20 minutes worth of inserts at GitLab.com scale instead of
# ~10 hours).
# In all other cases, installations will get both bulk inserts and the
# ability for these jobs to retry without having to clear and reinsert.
clear_untracked_file_paths unless can_bulk_insert_and_ignore_duplicates?
store_untracked_file_paths store_untracked_file_paths
schedule_populate_untracked_uploads_jobs schedule_populate_untracked_uploads_jobs
end end
...@@ -44,6 +56,10 @@ module Gitlab ...@@ -44,6 +56,10 @@ module Gitlab
end end
end end
def clear_untracked_file_paths
UntrackedFile.delete_all
end
def store_untracked_file_paths def store_untracked_file_paths
return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR) return unless Dir.exist?(ABSOLUTE_UPLOAD_DIR)
...@@ -96,36 +112,35 @@ module Gitlab ...@@ -96,36 +112,35 @@ module Gitlab
end end
def insert_file_paths(file_paths) def insert_file_paths(file_paths)
ActiveRecord::Base.transaction do sql = if postgresql_pre_9_5?
file_paths.each do |file_path| "INSERT INTO #{table_columns_and_values_for_insert(file_paths)};"
insert_file_path(file_path) elsif postgresql?
end "INSERT INTO #{table_columns_and_values_for_insert(file_paths)} ON CONFLICT DO NOTHING;"
end else # MySQL
end "INSERT IGNORE INTO #{table_columns_and_values_for_insert(file_paths)};"
end
def insert_file_path(file_path) ActiveRecord::Base.connection.execute(sql)
if postgresql_pre_9_5? end
# No easy way to do ON CONFLICT DO NOTHING before Postgres 9.5 so just use Rails
return UntrackedFile.where(path: file_path).first_or_create
end
table_columns_and_values = 'untracked_files_for_uploads (path, created_at, updated_at) VALUES (?, ?, ?)' def table_columns_and_values_for_insert(file_paths)
timestamp = Time.now.utc.iso8601
sql = if postgresql? values = file_paths.map do |file_path|
"INSERT INTO #{table_columns_and_values} ON CONFLICT DO NOTHING;" ActiveRecord::Base.send(:sanitize_sql_array, ['(?, ?, ?)', file_path, timestamp, timestamp]) # rubocop:disable GitlabSecurity/PublicSend
else end.join(', ')
"INSERT IGNORE INTO #{table_columns_and_values};"
end
timestamp = Time.now.utc.iso8601 "#{UntrackedFile.table_name} (path, created_at, updated_at) VALUES #{values}"
sql = ActiveRecord::Base.send(:sanitize_sql_array, [sql, file_path, timestamp, timestamp]) # rubocop:disable GitlabSecurity/PublicSend
ActiveRecord::Base.connection.execute(sql)
end end
def postgresql? def postgresql?
@postgresql ||= Gitlab::Database.postgresql? @postgresql ||= Gitlab::Database.postgresql?
end end
def can_bulk_insert_and_ignore_duplicates?
!postgresql_pre_9_5?
end
def postgresql_pre_9_5? def postgresql_pre_9_5?
@postgresql_pre_9_5 ||= postgresql? && @postgresql_pre_9_5 ||= postgresql? &&
ActiveRecord::Base.connection.select_value('SHOW server_version_num').to_i < 90500 ActiveRecord::Base.connection.select_value('SHOW server_version_num').to_i < 90500
......
...@@ -53,80 +53,178 @@ describe Gitlab::BackgroundMigration::PrepareUntrackedUploads, :migration, :side ...@@ -53,80 +53,178 @@ describe Gitlab::BackgroundMigration::PrepareUntrackedUploads, :migration, :side
expect(record.reload.path.size).to eq(519) expect(record.reload.path.size).to eq(519)
end end
context 'when files were uploaded before and after hashed storage was enabled' do context "test bulk insert with ON CONFLICT DO NOTHING or IGNORE" do
let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) } around do |example|
let!(:user) { create(:user, :with_avatar) } # If this is CI, we use Postgres 9.2 so this whole context should be
let!(:project1) { create(:project, :with_avatar) } # skipped since we're unable to use ON CONFLICT DO NOTHING or IGNORE.
let(:project2) { create(:project) } # instantiate after enabling hashed_storage if described_class.new.send(:can_bulk_insert_and_ignore_duplicates?)
example.run
end
end
before do context 'when files were uploaded before and after hashed storage was enabled' do
# Markdown upload before enabling hashed_storage let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) }
UploadService.new(project1, uploaded_file, FileUploader).execute let!(:user) { create(:user, :with_avatar) }
let!(:project1) { create(:project, :with_avatar) }
let(:project2) { create(:project) } # instantiate after enabling hashed_storage
stub_application_setting(hashed_storage_enabled: true) before do
# Markdown upload before enabling hashed_storage
UploadService.new(project1, uploaded_file, FileUploader).execute
# Markdown upload after enabling hashed_storage stub_application_setting(hashed_storage_enabled: true)
UploadService.new(project2, uploaded_file, FileUploader).execute
end
it 'adds unhashed files to the untracked_files_for_uploads table' do # Markdown upload after enabling hashed_storage
described_class.new.perform UploadService.new(project2, uploaded_file, FileUploader).execute
end
expect(untracked_files_for_uploads.count).to eq(5) it 'adds unhashed files to the untracked_files_for_uploads table' do
end described_class.new.perform
it 'adds files with paths relative to CarrierWave.root' do expect(untracked_files_for_uploads.count).to eq(5)
described_class.new.perform
untracked_files_for_uploads.all.each do |file|
expect(file.path.start_with?('uploads/')).to be_truthy
end end
end
it 'does not add hashed files to the untracked_files_for_uploads table' do it 'adds files with paths relative to CarrierWave.root' do
described_class.new.perform described_class.new.perform
untracked_files_for_uploads.all.each do |file|
hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path expect(file.path.start_with?('uploads/')).to be_truthy
expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey end
end end
it 'correctly schedules the follow-up background migration jobs' do it 'does not add hashed files to the untracked_files_for_uploads table' do
described_class.new.perform described_class.new.perform
expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5) hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path
expect(BackgroundMigrationWorker.jobs.size).to eq(1) expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey
end end
# E.g. from a previous failed run of this background migration it 'correctly schedules the follow-up background migration jobs' do
context 'when there is existing data in untracked_files_for_uploads' do
before do
described_class.new.perform described_class.new.perform
expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5)
expect(BackgroundMigrationWorker.jobs.size).to eq(1)
end end
it 'does not error or produce duplicates of existing data' do # E.g. from a previous failed run of this background migration
expect do context 'when there is existing data in untracked_files_for_uploads' do
before do
described_class.new.perform described_class.new.perform
end.not_to change { untracked_files_for_uploads.count }.from(5) end
it 'does not error or produce duplicates of existing data' do
expect do
described_class.new.perform
end.not_to change { untracked_files_for_uploads.count }.from(5)
end
end end
# E.g. The installation is in use at the time of migration, and someone has
# just uploaded a file
context 'when there are files in /uploads/tmp' do
let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') }
before do
FileUtils.touch(tmp_file)
end
after do
FileUtils.rm(tmp_file)
end
it 'does not add files from /uploads/tmp' do
described_class.new.perform
expect(untracked_files_for_uploads.count).to eq(5)
end
end
end
end
context 'test bulk insert without ON CONFLICT DO NOTHING or IGNORE' do
before do
# If this is CI, we use Postgres 9.2 so this stub has no effect.
#
# If this is being run on Postgres 9.5+ or MySQL, then this stub allows us
# to test the bulk insert functionality without ON CONFLICT DO NOTHING or
# IGNORE.
allow_any_instance_of(described_class).to receive(:postgresql_pre_9_5?).and_return(true)
end end
# E.g. The installation is in use at the time of migration, and someone has context 'when files were uploaded before and after hashed storage was enabled' do
# just uploaded a file let!(:appearance) { create(:appearance, logo: uploaded_file, header_logo: uploaded_file) }
context 'when there are files in /uploads/tmp' do let!(:user) { create(:user, :with_avatar) }
let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') } let!(:project1) { create(:project, :with_avatar) }
let(:project2) { create(:project) } # instantiate after enabling hashed_storage
before do before do
FileUtils.touch(tmp_file) # Markdown upload before enabling hashed_storage
end UploadService.new(project1, uploaded_file, FileUploader).execute
after do stub_application_setting(hashed_storage_enabled: true)
FileUtils.rm(tmp_file)
# Markdown upload after enabling hashed_storage
UploadService.new(project2, uploaded_file, FileUploader).execute
end end
it 'does not add files from /uploads/tmp' do it 'adds unhashed files to the untracked_files_for_uploads table' do
described_class.new.perform described_class.new.perform
expect(untracked_files_for_uploads.count).to eq(5) expect(untracked_files_for_uploads.count).to eq(5)
end end
it 'adds files with paths relative to CarrierWave.root' do
described_class.new.perform
untracked_files_for_uploads.all.each do |file|
expect(file.path.start_with?('uploads/')).to be_truthy
end
end
it 'does not add hashed files to the untracked_files_for_uploads table' do
described_class.new.perform
hashed_file_path = project2.uploads.where(uploader: 'FileUploader').first.path
expect(untracked_files_for_uploads.where("path like '%#{hashed_file_path}%'").exists?).to be_falsey
end
it 'correctly schedules the follow-up background migration jobs' do
described_class.new.perform
expect(described_class::FOLLOW_UP_MIGRATION).to be_scheduled_migration(1, 5)
expect(BackgroundMigrationWorker.jobs.size).to eq(1)
end
# E.g. from a previous failed run of this background migration
context 'when there is existing data in untracked_files_for_uploads' do
before do
described_class.new.perform
end
it 'does not error or produce duplicates of existing data' do
expect do
described_class.new.perform
end.not_to change { untracked_files_for_uploads.count }.from(5)
end
end
# E.g. The installation is in use at the time of migration, and someone has
# just uploaded a file
context 'when there are files in /uploads/tmp' do
let(:tmp_file) { Rails.root.join(described_class::ABSOLUTE_UPLOAD_DIR, 'tmp', 'some_file.jpg') }
before do
FileUtils.touch(tmp_file)
end
after do
FileUtils.rm(tmp_file)
end
it 'does not add files from /uploads/tmp' do
described_class.new.perform
expect(untracked_files_for_uploads.count).to eq(5)
end
end
end end
end end
......
...@@ -62,8 +62,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do ...@@ -62,8 +62,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do
expect(appearance.reload.uploads.where("path like '%/header_logo/%'").first.attributes).to include(@appearance_header_logo_attributes) expect(appearance.reload.uploads.where("path like '%/header_logo/%'").first.attributes).to include(@appearance_header_logo_attributes)
expect(user2.reload.uploads.first.attributes).to include(@user2_avatar_attributes) expect(user2.reload.uploads.first.attributes).to include(@user2_avatar_attributes)
expect(project2.reload.uploads.first.attributes).to include(@project2_avatar_attributes) expect(project2.reload.uploads.where(uploader: 'AvatarUploader').first.attributes).to include(@project2_avatar_attributes)
expect(project2.uploads.last.attributes).to include(@project2_markdown_attributes) expect(project2.uploads.where(uploader: 'FileUploader').first.attributes).to include(@project2_markdown_attributes)
end end
it 'ignores already-tracked uploads' do it 'ignores already-tracked uploads' do
...@@ -71,8 +71,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do ...@@ -71,8 +71,8 @@ describe TrackUntrackedUploads, :migration, :sidekiq do
expect(appearance.reload.uploads.where("path like '%/logo/%'").first.attributes).to include(@appearance_logo_attributes) expect(appearance.reload.uploads.where("path like '%/logo/%'").first.attributes).to include(@appearance_logo_attributes)
expect(user1.reload.uploads.first.attributes).to include(@user1_avatar_attributes) expect(user1.reload.uploads.first.attributes).to include(@user1_avatar_attributes)
expect(project1.reload.uploads.first.attributes).to include(@project1_avatar_attributes) expect(project1.reload.uploads.where(uploader: 'AvatarUploader').first.attributes).to include(@project1_avatar_attributes)
expect(project1.uploads.last.attributes).to include(@project1_markdown_attributes) expect(project1.uploads.where(uploader: 'FileUploader').first.attributes).to include(@project1_markdown_attributes)
end end
it 'the temporary table untracked_files_for_uploads no longer exists' do it 'the temporary table untracked_files_for_uploads no longer exists' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment