Commit 0e831b0b authored by Nick Thomas's avatar Nick Thomas

Allow external diffs to be used conditionally

Since external diffs are likely to be a bit slower than in-database
ones, add a mode that makes diffs external after they've been obsoleted
by events. This should strike a balance between performance and disk
space.

A background cron drives the majority of migrations, since diffs become
outdated through user actions.
parent 98824f3e
......@@ -12,6 +12,10 @@ class MergeRequestDiff < ActiveRecord::Base
# Don't display more than 100 commits at once
COMMITS_SAFE_SIZE = 100
# Applies to closed or merged MRs when determining whether to migrate their
# diffs to external storage
EXTERNAL_DIFF_CUTOFF = 7.days.freeze
belongs_to :merge_request
manual_inverse_association :merge_request, :merge_request_diff
......@@ -48,6 +52,81 @@ class MergeRequestDiff < ActiveRecord::Base
end
scope :recent, -> { order(id: :desc).limit(100) }
scope :files_in_database, -> { where(stored_externally: [false, nil]) }
scope :not_latest_diffs, -> do
merge_requests = MergeRequest.arel_table
mr_diffs = arel_table
join_condition = merge_requests[:id].eq(mr_diffs[:merge_request_id])
.and(mr_diffs[:id].not_eq(merge_requests[:latest_merge_request_diff_id]))
arel_join = mr_diffs.join(merge_requests).on(join_condition)
joins(arel_join.join_sources)
end
scope :old_merged_diffs, -> (before) do
merge_requests = MergeRequest.arel_table
mr_metrics = MergeRequest::Metrics.arel_table
mr_diffs = arel_table
mr_join = mr_diffs
.join(merge_requests)
.on(mr_diffs[:merge_request_id].eq(merge_requests[:id]))
metrics_join_condition = mr_diffs[:merge_request_id]
.eq(mr_metrics[:merge_request_id])
.and(mr_metrics[:merged_at].not_eq(nil))
metrics_join = mr_diffs.join(mr_metrics).on(metrics_join_condition)
condition = MergeRequest.arel_table[:state].eq(:merged)
.and(MergeRequest::Metrics.arel_table[:merged_at].lteq(before))
.and(MergeRequest::Metrics.arel_table[:merged_at].not_eq(nil))
joins(metrics_join.join_sources, mr_join.join_sources).where(condition)
end
scope :old_closed_diffs, -> (before) do
condition = MergeRequest.arel_table[:state].eq(:closed)
.and(MergeRequest::Metrics.arel_table[:latest_closed_at].lteq(before))
joins(merge_request: :metrics).where(condition)
end
def self.ids_for_external_storage_migration(limit:)
# No point doing any work unless the feature is enabled
return [] unless Gitlab.config.external_diffs.enabled
case Gitlab.config.external_diffs.when
when 'always'
files_in_database.limit(limit).pluck(:id)
when 'outdated'
# Outdated is too complex to be a single SQL query, so split into three
before = EXTERNAL_DIFF_CUTOFF.ago
ids = files_in_database
.old_merged_diffs(before)
.limit(limit)
.pluck(:id)
return ids if ids.size >= limit
ids += files_in_database
.old_closed_diffs(before)
.limit(limit - ids.size)
.pluck(:id)
return ids if ids.size >= limit
ids + files_in_database
.not_latest_diffs
.limit(limit - ids.size)
.pluck(:id)
else
[]
end
end
mount_uploader :external_diff, ExternalDiffUploader
......@@ -55,7 +134,7 @@ class MergeRequestDiff < ActiveRecord::Base
# It allows you to override variables like head_commit_sha before getting diff.
after_create :save_git_content, unless: :importing?
after_save :update_external_diff_store, if: :external_diff_changed?
after_save :update_external_diff_store, if: -> { !importing? && external_diff_changed? }
def self.find_by_diff_refs(diff_refs)
find_by(start_commit_sha: diff_refs.start_sha, head_commit_sha: diff_refs.head_sha, base_commit_sha: diff_refs.base_sha)
......@@ -294,27 +373,31 @@ class MergeRequestDiff < ActiveRecord::Base
end
end
private
# Transactionally migrate the current merge_request_diff_files entries to
# external storage. If external storage isn't an option for this diff, the
# method is a no-op.
def migrate_files_to_external_storage!
return if stored_externally? || !use_external_diff? || merge_request_diff_files.count == 0
def encode_in_base64?(diff_text)
(diff_text.encoding == Encoding::BINARY && !diff_text.ascii_only?) ||
diff_text.include?("\0")
rows = build_merge_request_diff_files(merge_request_diff_files)
transaction do
MergeRequestDiffFile.where(merge_request_diff_id: id).delete_all
create_merge_request_diff_files(rows)
save!
end
def create_merge_request_diff_files(diffs)
rows =
if has_attribute?(:external_diff) && Gitlab.config.external_diffs.enabled
build_external_merge_request_diff_files(diffs)
else
build_merge_request_diff_files(diffs)
merge_request_diff_files.reload
end
# Faster inserts
Gitlab::Database.bulk_insert('merge_request_diff_files', rows)
private
def encode_in_base64?(diff_text)
(diff_text.encoding == Encoding::BINARY && !diff_text.ascii_only?) ||
diff_text.include?("\0")
end
def build_external_merge_request_diff_files(diffs)
rows = build_merge_request_diff_files(diffs)
def build_external_merge_request_diff_files(rows)
tempfile = build_external_diff_tempfile(rows)
self.external_diff = tempfile
......@@ -325,16 +408,21 @@ class MergeRequestDiff < ActiveRecord::Base
tempfile&.unlink
end
def create_merge_request_diff_files(rows)
rows = build_external_merge_request_diff_files(rows) if use_external_diff?
# Faster inserts
Gitlab::Database.bulk_insert('merge_request_diff_files', rows)
end
def build_external_diff_tempfile(rows)
Tempfile.open(external_diff.filename) do |file|
rows.inject(0) do |offset, row|
rows.each do |row|
data = row.delete(:diff)
row[:external_diff_offset] = offset
row[:external_diff_size] = data.size
row[:external_diff_offset] = file.pos
row[:external_diff_size] = data.bytesize
file.write(data)
offset + data.size
end
file
......@@ -361,6 +449,47 @@ class MergeRequestDiff < ActiveRecord::Base
end
end
def use_external_diff?
return false unless has_attribute?(:external_diff)
return false unless Gitlab.config.external_diffs.enabled
case Gitlab.config.external_diffs.when
when 'always'
true
when 'outdated'
outdated_by_merge? || outdated_by_closure? || old_version?
else
false # Disable external diffs if misconfigured
end
end
def outdated_by_merge?
return false unless merge_request&.metrics&.merged_at
merge_request.merged? && merge_request.metrics.merged_at < EXTERNAL_DIFF_CUTOFF.ago
end
def outdated_by_closure?
return false unless merge_request&.metrics&.latest_closed_at
merge_request.closed? && merge_request.metrics.latest_closed_at < EXTERNAL_DIFF_CUTOFF.ago
end
# We can't rely on `merge_request.latest_merge_request_diff_id` because that
# may have been changed in `save_git_content` without being reflected in
# the association's instance. This query is always subject to races, but
# the worst case is that we *don't* make a diff external when we could. The
# background worker will make it external at a later date.
def old_version?
latest_id = MergeRequest
.where(id: merge_request_id)
.limit(1)
.pluck(:latest_merge_request_diff_id)
.first
self.id != latest_id
end
def load_diffs(options)
# Ensure all diff files operate on the same external diff file instance if
# present. This reduces file open/close overhead.
......@@ -394,7 +523,8 @@ class MergeRequestDiff < ActiveRecord::Base
if diff_collection.any?
new_attributes[:state] = :collected
create_merge_request_diff_files(diff_collection)
rows = build_merge_request_diff_files(diff_collection)
create_merge_request_diff_files(rows)
end
# Set our state to 'overflow' to make the #empty? and #collected?
......
# frozen_string_literal: true
module MergeRequests
class MigrateExternalDiffsService < ::BaseService
MAX_JOBS = 1000.freeze
attr_reader :diff
def self.enqueue!
ids = MergeRequestDiff.ids_for_external_storage_migration(limit: MAX_JOBS)
MigrateExternalDiffsWorker.bulk_perform_async(ids.map { |id| [id] })
end
def initialize(merge_request_diff)
@diff = merge_request_diff
end
def execute
diff.migrate_files_to_external_storage!
end
end
end
......@@ -21,6 +21,7 @@
- cronjob:trending_projects
- cronjob:issue_due_scheduler
- cronjob:prune_web_hook_logs
- cronjob:schedule_migrate_external_diffs
- gcp_cluster:cluster_install_app
- gcp_cluster:cluster_patch_app
......@@ -119,6 +120,7 @@
- invalid_gpg_signature_update
- irker
- merge
- migrate_external_diffs
- namespaceless_project_destroy
- new_issue
- new_merge_request
......
# frozen_string_literal: false
class MigrateExternalDiffsWorker
include ApplicationWorker
def perform(merge_request_diff_id)
diff = MergeRequestDiff.find_by_id(merge_request_diff_id)
return unless diff
MergeRequests::MigrateExternalDiffsService.new(diff).execute
end
end
# frozen_string_literal: false
class ScheduleMigrateExternalDiffsWorker
include ApplicationWorker
include CronjobQueue
include Gitlab::ExclusiveLeaseHelpers
def perform
in_lock(self.class.name.underscore, ttl: 2.hours, retries: 0) do
MergeRequests::MigrateExternalDiffsService.enqueue!
end
rescue Gitlab::ExclusiveLeaseHelpers::FailedToObtainLockError
end
end
---
title: 'Allow external diffs to be used conditionally'
merge_request: 25432
author:
type: added
......@@ -301,6 +301,10 @@ production: &base
pages_domain_verification_cron_worker:
cron: "*/15 * * * *"
# Periodically migrate diffs from the database to external storage
schedule_migrate_external_diffs_worker:
cron: "15 * * * *"
registry:
# enabled: true
# host: registry.example.com
......@@ -787,6 +791,10 @@ test:
enabled: true
external_diffs:
enabled: false
# Diffs may be `always` external (the default), or they can be made external
# after they have become `outdated` (i.e., the MR is closed or a new version
# has been pushed).
# when: always
# The location where external diffs are stored (default: shared/external-diffs).
# storage_path: shared/external-diffs
object_store:
......
......@@ -238,6 +238,7 @@ Settings.pages.admin['certificate'] ||= ''
#
Settings['external_diffs'] ||= Settingslogic.new({})
Settings.external_diffs['enabled'] = false if Settings.external_diffs['enabled'].nil?
Settings.external_diffs['when'] = 'always' if Settings.external_diffs['when'].nil?
Settings.external_diffs['storage_path'] = Settings.absolute(Settings.external_diffs['storage_path'] || File.join(Settings.shared['path'], 'external-diffs'))
Settings.external_diffs['object_store'] = ObjectStoreSettings.parse(Settings.external_diffs['object_store'])
......@@ -344,6 +345,10 @@ Settings.cron_jobs['prune_web_hook_logs_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['prune_web_hook_logs_worker']['cron'] ||= '0 */1 * * *'
Settings.cron_jobs['prune_web_hook_logs_worker']['job_class'] = 'PruneWebHookLogsWorker'
Settings.cron_jobs['schedule_migrate_external_diffs_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['schedule_migrate_external_diffs_worker']['cron'] ||= '15 * * * *'
Settings.cron_jobs['schedule_migrate_external_diffs_worker']['job_class'] = 'ScheduleMigrateExternalDiffsWorker'
#
# Sidekiq
#
......
......@@ -89,3 +89,4 @@
- [project_daily_statistics, 1]
- [import_issues_csv, 2]
- [chat_notification, 2]
- [migrate_external_diffs, 1]
# frozen_string_literal: true
class AddIndexesForMergeRequestDiffsQuery < ActiveRecord::Migration[5.0]
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
INDEX_SPECS = [
[
:merge_request_metrics,
:latest_closed_at,
{ where: 'latest_closed_at IS NOT NULL' }
],
[
:merge_request_metrics,
[:merge_request_id, :merged_at],
{ where: 'merged_at IS NOT NULL' }
],
[
:merge_request_diffs,
[:merge_request_id, :id],
{
name: 'index_merge_request_diffs_on_merge_request_id_and_id_partial',
where: 'NOT stored_externally OR stored_externally IS NULL'
}
]
].freeze
disable_ddl_transaction!
def up
INDEX_SPECS.each do |spec|
add_concurrent_index(*spec)
end
end
def down
INDEX_SPECS.reverse.each do |spec|
remove_concurrent_index(*spec)
end
end
end
......@@ -1256,6 +1256,7 @@ ActiveRecord::Schema.define(version: 20190322132835) do
t.integer "external_diff_store"
t.boolean "stored_externally"
t.index ["merge_request_id", "id"], name: "index_merge_request_diffs_on_merge_request_id_and_id", using: :btree
t.index ["merge_request_id", "id"], name: "index_merge_request_diffs_on_merge_request_id_and_id_partial", where: "((NOT stored_externally) OR (stored_externally IS NULL))", using: :btree
end
create_table "merge_request_metrics", force: :cascade do |t|
......@@ -1271,7 +1272,9 @@ ActiveRecord::Schema.define(version: 20190322132835) do
t.integer "latest_closed_by_id"
t.datetime_with_timezone "latest_closed_at"
t.index ["first_deployed_to_production_at"], name: "index_merge_request_metrics_on_first_deployed_to_production_at", using: :btree
t.index ["latest_closed_at"], name: "index_merge_request_metrics_on_latest_closed_at", where: "(latest_closed_at IS NOT NULL)", using: :btree
t.index ["latest_closed_by_id"], name: "index_merge_request_metrics_on_latest_closed_by_id", using: :btree
t.index ["merge_request_id", "merged_at"], name: "index_merge_request_metrics_on_merge_request_id_and_merged_at", where: "(merged_at IS NOT NULL)", using: :btree
t.index ["merge_request_id"], name: "index_merge_request_metrics", using: :btree
t.index ["merged_by_id"], name: "index_merge_request_metrics_on_merged_by_id", using: :btree
t.index ["pipeline_id"], name: "index_merge_request_metrics_on_pipeline_id", using: :btree
......
......@@ -145,3 +145,47 @@ The connection settings match those provided by [Fog](https://github.com/fog), a
```
1. Save the file and [restart GitLab](restart_gitlab.md#installations-from-source) for the changes to take effect.
### Alternative in-database storage
Enabling external diffs may reduce the performance of merge requests, as they
must be retrieved in a separate operation to other data. A compromise may be
reached by only storing outdated diffs externally, while keeping current diffs
in the database.
To enable this feature, perform the following steps:
**In Omnibus installations:**
1. Edit `/etc/gitlab/gitlab.rb` and add the following line:
```ruby
gitlab_rails['external_diffs_when'] = 'outdated'
```
1. Save the file and [reconfigure GitLab](restart_gitlab.md#omnibus-gitlab-reconfigure) for the changes to take effect.
**In installations from source:**
1. Edit `/home/git/gitlab/config/gitlab.yml` and add or amend the following
lines:
```yaml
external_diffs:
enabled: true
when: outdated
```
1. Save the file and [restart GitLab](restart_gitlab.md#installations-from-source) for the changes to take effect.
With this feature enabled, diffs will initially stored in the database, rather
than externally. They will be moved to external storage once any of these
conditions become true:
- A newer version of the merge request diff exists
- The merge request was merged more than seven days ago
- The merge request was closed more than seven day ago
These rules strike a balance between space and performance by only storing
frequently-accessed diffs in the database. Diffs that are less likely to be
accessed are moved to external storage instead.
......@@ -46,10 +46,26 @@ FactoryBot.define do
target_branch "improve/awesome"
end
trait :merged_last_month do
merged
after(:build) do |merge_request|
merge_request.build_metrics.merged_at = 1.month.ago
end
end
trait :closed do
state :closed
end
trait :closed_last_month do
closed
after(:build) do |merge_request|
merge_request.build_metrics.latest_closed_at = 1.month.ago
end
end
trait :opened do
state :opened
end
......
......@@ -51,7 +51,104 @@ describe MergeRequestDiff do
end
end
describe '#latest' do
describe '.ids_for_external_storage_migration' do
set(:merge_request) { create(:merge_request) }
set(:outdated) { merge_request.merge_request_diff }
set(:latest) { merge_request.create_merge_request_diff }
set(:closed_mr) { create(:merge_request, :closed_last_month) }
let(:closed) { closed_mr.merge_request_diff }
set(:merged_mr) { create(:merge_request, :merged_last_month) }
let(:merged) { merged_mr.merge_request_diff }
set(:recently_closed_mr) { create(:merge_request, :closed) }
let(:closed_recently) { recently_closed_mr.merge_request_diff }
set(:recently_merged_mr) { create(:merge_request, :merged) }
let(:merged_recently) { recently_merged_mr.merge_request_diff }
before do
merge_request.update!(latest_merge_request_diff: latest)
end
subject { described_class.ids_for_external_storage_migration(limit: 1000) }
context 'external diffs are disabled' do
before do
stub_external_diffs_setting(enabled: false)
end
it { is_expected.to be_empty }
end
context 'external diffs are misconfigured' do
before do
stub_external_diffs_setting(enabled: true, when: 'every second tuesday')
end
it { is_expected.to be_empty }
end
context 'external diffs are enabled unconditionally' do
before do
stub_external_diffs_setting(enabled: true)
end
it { is_expected.to contain_exactly(outdated.id, latest.id, closed.id, merged.id, closed_recently.id, merged_recently.id) }
end
context 'external diffs are enabled for outdated diffs' do
before do
stub_external_diffs_setting(enabled: true, when: 'outdated')
end
it 'returns records for outdated merge request versions' do
is_expected.to contain_exactly(outdated.id, closed.id, merged.id)
end
end
context 'with limit' do
it 'respects the limit' do
stub_external_diffs_setting(enabled: true)
expect(described_class.ids_for_external_storage_migration(limit: 3).count).to eq(3)
end
end
end
describe '#migrate_files_to_external_storage!' do
let(:diff) { create(:merge_request).merge_request_diff }
it 'converts from in-database to external storage' do
expect(diff).not_to be_stored_externally
stub_external_diffs_setting(enabled: true)
expect(diff).to receive(:save!)
diff.migrate_files_to_external_storage!
expect(diff).to be_stored_externally
end
it 'does nothing with an external diff' do
stub_external_diffs_setting(enabled: true)
expect(diff).to be_stored_externally
expect(diff).not_to receive(:save!)
diff.migrate_files_to_external_storage!
end
it 'does nothing if external diffs are disabled' do
expect(diff).not_to be_stored_externally
expect(diff).not_to receive(:save!)
diff.migrate_files_to_external_storage!
end
end
describe '#latest?' do
let!(:mr) { create(:merge_request, :with_diffs) }
let!(:first_diff) { mr.merge_request_diff }
let!(:last_diff) { mr.create_merge_request_diff }
......@@ -222,12 +319,56 @@ describe MergeRequestDiff do
include_examples 'merge request diffs'
end
describe 'external diffs configured' do
describe 'external diffs always enabled' do
before do
stub_external_diffs_setting(enabled: true)
stub_external_diffs_setting(enabled: true, when: 'always')
end
include_examples 'merge request diffs'
end
describe 'exernal diffs enabled for outdated diffs' do
before do
stub_external_diffs_setting(enabled: true, when: 'outdated')
end
include_examples 'merge request diffs'
it 'stores up-to-date diffs in the database' do
expect(diff).not_to be_stored_externally
end
it 'stores diffs for recently closed MRs in the database' do
mr = create(:merge_request, :closed)
expect(mr.merge_request_diff).not_to be_stored_externally
end
it 'stores diffs for recently merged MRs in the database' do
mr = create(:merge_request, :merged)
expect(mr.merge_request_diff).not_to be_stored_externally
end
it 'stores diffs for old MR versions in external storage' do
old_diff = diff
merge_request.create_merge_request_diff
old_diff.migrate_files_to_external_storage!
expect(old_diff).to be_stored_externally
end
it 'stores diffs for old closed MRs in external storage' do
mr = create(:merge_request, :closed_last_month)
expect(mr.merge_request_diff).to be_stored_externally
end
it 'stores diffs for old merged MRs in external storage' do
mr = create(:merge_request, :merged_last_month)
expect(mr.merge_request_diff).to be_stored_externally
end
end
describe '#commit_shas' do
......
# frozen_string_literal: true
require 'spec_helper'
describe MergeRequests::MigrateExternalDiffsService do
let(:merge_request) { create(:merge_request) }
let(:diff) { merge_request.merge_request_diff }
describe '.enqueue!', :sidekiq do
around do |example|
Sidekiq::Testing.fake! { example.run }
end
it 'enqueues nothing if external diffs are disabled' do
expect(diff).not_to be_stored_externally
expect { described_class.enqueue! }
.not_to change { MigrateExternalDiffsWorker.jobs.count }
end
it 'enqueues eligible in-database diffs if external diffs are enabled' do
expect(diff).not_to be_stored_externally
stub_external_diffs_setting(enabled: true)
expect { described_class.enqueue! }
.to change { MigrateExternalDiffsWorker.jobs.count }
.by(1)
end
end
describe '#execute' do
it 'migrates an in-database diff to the external store' do
expect(diff).not_to be_stored_externally
stub_external_diffs_setting(enabled: true)
described_class.new(diff).execute
expect(diff).to be_stored_externally
end
end
end
# frozen_string_literal: true
require 'spec_helper'
describe MigrateExternalDiffsWorker do
let(:worker) { described_class.new }
let(:diff) { create(:merge_request).merge_request_diff }
describe '#perform' do
it 'migrates the listed diff' do
expect_next_instance_of(MergeRequests::MigrateExternalDiffsService) do |instance|
expect(instance.diff).to eq(diff)
expect(instance).to receive(:execute)
end
worker.perform(diff.id)
end
it 'does nothing if the diff is missing' do
diff.destroy
worker.perform(diff.id)
end
end
end
# frozen_string_literal: true
require 'spec_helper'
describe ScheduleMigrateExternalDiffsWorker do
include ExclusiveLeaseHelpers
let(:worker) { described_class.new }
describe '#perform' do
it 'triggers a scan for diffs to migrate' do
expect(MergeRequests::MigrateExternalDiffsService).to receive(:enqueue!)
worker.perform
end
it 'will not run if the lease is already taken' do
stub_exclusive_lease_taken('schedule_migrate_external_diffs_worker', timeout: 2.hours)
expect(MergeRequests::MigrateExternalDiffsService).not_to receive(:enqueue!)
worker.perform
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment