Commit 591380a3 authored by Sean McGivern's avatar Sean McGivern

Merge branch '52568-external-mr-diffs' into 'master'

Allow merge request diffs to be placed into an object store

Closes #52568

See merge request gitlab-org/gitlab-ce!24276
parents d0187de2 f9e41d0d
......@@ -7,6 +7,7 @@ class MergeRequestDiff < ActiveRecord::Base
include IgnorableColumn
include EachBatch
include Gitlab::Utils::StrongMemoize
include ObjectStorage::BackgroundMove
# Don't display more than 100 commits at once
COMMITS_SAFE_SIZE = 100
......@@ -15,9 +16,13 @@ class MergeRequestDiff < ActiveRecord::Base
:st_diffs
belongs_to :merge_request
manual_inverse_association :merge_request, :merge_request_diff
has_many :merge_request_diff_files, -> { order(:merge_request_diff_id, :relative_order) }
has_many :merge_request_diff_files,
-> { order(:merge_request_diff_id, :relative_order) },
inverse_of: :merge_request_diff
has_many :merge_request_diff_commits, -> { order(:merge_request_diff_id, :relative_order) }
state_machine :state, initial: :empty do
......@@ -45,10 +50,14 @@ class MergeRequestDiff < ActiveRecord::Base
scope :recent, -> { order(id: :desc).limit(100) }
mount_uploader :external_diff, ExternalDiffUploader
# All diff information is collected from repository after object is created.
# It allows you to override variables like head_commit_sha before getting diff.
after_create :save_git_content, unless: :importing?
after_save :update_external_diff_store, if: :external_diff_changed?
def self.find_by_diff_refs(diff_refs)
find_by(start_commit_sha: diff_refs.start_sha, head_commit_sha: diff_refs.head_sha, base_commit_sha: diff_refs.base_sha)
end
......@@ -241,10 +250,97 @@ class MergeRequestDiff < ActiveRecord::Base
end
end
# Carrierwave defines `write_uploader` dynamically on this class, so `super`
# does not work. Alias the carrierwave method so we can call it when needed
alias_method :carrierwave_write_uploader, :write_uploader
# The `external_diff`, `external_diff_store`, and `stored_externally`
# columns were introduced in GitLab 11.8, but some background migration specs
# use factories that rely on current code with an old schema. Without these
# `has_attribute?` guards, they fail with a `MissingAttributeError`.
#
# For more details, see: https://gitlab.com/gitlab-org/gitlab-ce/issues/44990
def write_uploader(column, identifier)
carrierwave_write_uploader(column, identifier) if has_attribute?(column)
end
def update_external_diff_store
update_column(:external_diff_store, external_diff.object_store) if
has_attribute?(:external_diff_store)
end
def external_diff_changed?
super if has_attribute?(:external_diff)
end
def stored_externally
super if has_attribute?(:stored_externally)
end
alias_method :stored_externally?, :stored_externally
# If enabled, yields the external file containing the diff. Otherwise, yields
# nil. This method is not thread-safe, but it *is* re-entrant, which allows
# multiple merge_request_diff_files to load their data efficiently
def opening_external_diff
return yield(nil) unless stored_externally?
return yield(@external_diff_file) if @external_diff_file
external_diff.open do |file|
begin
@external_diff_file = file
yield(@external_diff_file)
ensure
@external_diff_file = nil
end
end
end
private
def create_merge_request_diff_files(diffs)
rows = diffs.map.with_index do |diff, index|
rows =
if has_attribute?(:external_diff) && Gitlab.config.external_diffs.enabled
build_external_merge_request_diff_files(diffs)
else
build_merge_request_diff_files(diffs)
end
# Faster inserts
Gitlab::Database.bulk_insert('merge_request_diff_files', rows)
end
def build_external_merge_request_diff_files(diffs)
rows = build_merge_request_diff_files(diffs)
tempfile = build_external_diff_tempfile(rows)
self.external_diff = tempfile
self.stored_externally = true
rows
ensure
tempfile&.unlink
end
def build_external_diff_tempfile(rows)
Tempfile.open(external_diff.filename) do |file|
rows.inject(0) do |offset, row|
data = row.delete(:diff)
row[:external_diff_offset] = offset
row[:external_diff_size] = data.size
file.write(data)
offset + data.size
end
file
end
end
def build_merge_request_diff_files(diffs)
diffs.map.with_index do |diff, index|
diff_hash = diff.to_hash.merge(
binary: false,
merge_request_diff_id: self.id,
......@@ -261,11 +357,12 @@ class MergeRequestDiff < ActiveRecord::Base
end
end
end
Gitlab::Database.bulk_insert('merge_request_diff_files', rows)
end
def load_diffs(options)
# Ensure all diff files operate on the same external diff file instance if
# present. This reduces file open/close overhead.
opening_external_diff do
collection = merge_request_diff_files
if paths = options[:paths]
......@@ -274,6 +371,7 @@ class MergeRequestDiff < ActiveRecord::Base
Gitlab::Git::DiffCollection.new(collection.map(&:to_hash), options)
end
end
def load_commits
commits = merge_request_diff_commits.map { |commit| Commit.from_hash(commit.to_hash, project) }
......
......@@ -4,7 +4,7 @@ class MergeRequestDiffFile < ActiveRecord::Base
include Gitlab::EncodingHelper
include DiffFile
belongs_to :merge_request_diff
belongs_to :merge_request_diff, inverse_of: :merge_request_diff_files
def utf8_diff
return '' if diff.blank?
......@@ -13,6 +13,16 @@ class MergeRequestDiffFile < ActiveRecord::Base
end
def diff
binary? ? super.unpack('m0').first : super
content =
if merge_request_diff&.stored_externally?
merge_request_diff.opening_external_diff do |file|
file.seek(external_diff_offset)
file.read(external_diff_size)
end
else
super
end
binary? ? content.unpack('m0').first : content
end
end
# frozen_string_literal: true
class ExternalDiffUploader < GitlabUploader
include ObjectStorage::Concern
storage_options Gitlab.config.external_diffs
alias_method :upload, :model
def filename
"diff-#{model.id}"
end
def store_dir
dynamic_segment
end
private
def dynamic_segment
File.join(model.model_name.plural, "mr-#{model.merge_request_id}")
end
end
---
title: Allow merge request diffs to be placed into an object store
merge_request: 24276
author:
type: added
......@@ -166,6 +166,23 @@ production: &base
# aws_signature_version: 4 # For creation of signed URLs. Set to 2 if provider does not support v4.
# endpoint: 'https://s3.amazonaws.com' # default: nil - Useful for S3 compliant services such as DigitalOcean Spaces
## Merge request external diff storage
external_diffs:
# If disabled (the default), the diffs are in-database. Otherwise, they can
# be stored on disk, or in object storage
enabled: false
# The location where external diffs are stored (default: shared/lfs-external-diffs).
# storage_path: shared/external-diffs
# object_store:
# enabled: false
# remote_directory: external-diffs
# background_upload: false
# proxy_download: false
# connection:
# provider: AWS
# aws_access_key_id: AWS_ACCESS_KEY_ID
# aws_secret_access_key: AWS_SECRET_ACCESS_KEY
# region: us-east-1
## Git LFS
lfs:
......@@ -733,6 +750,18 @@ test:
<<: *base
gravatar:
enabled: true
external_diffs:
enabled: false
# The location where external diffs are stored (default: shared/external-diffs).
# storage_path: shared/external-diffs
object_store:
enabled: false
remote_directory: external-diffs # The bucket name
connection:
provider: AWS # Only AWS supported at the moment
aws_access_key_id: AWS_ACCESS_KEY_ID
aws_secret_access_key: AWS_SECRET_ACCESS_KEY
region: us-east-1
lfs:
enabled: false
# The location where LFS objects are stored (default: shared/lfs-objects).
......
......@@ -215,6 +215,14 @@ Settings.pages['artifacts_server'] ||= Settings.pages['enabled'] if Settings.pa
Settings.pages['admin'] ||= Settingslogic.new({})
Settings.pages.admin['certificate'] ||= ''
#
# External merge request diffs
#
Settings['external_diffs'] ||= Settingslogic.new({})
Settings.external_diffs['enabled'] = false if Settings.external_diffs['enabled'].nil?
Settings.external_diffs['storage_path'] = Settings.absolute(Settings.external_diffs['storage_path'] || File.join(Settings.shared['path'], 'external-diffs'))
Settings.external_diffs['object_store'] = ObjectStoreSettings.parse(Settings.external_diffs['object_store'])
#
# Git LFS
#
......
# frozen_string_literal: true
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class AddMergeRequestExternalDiffs < ActiveRecord::Migration[5.0]
include Gitlab::Database::MigrationHelpers
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
def change
# Allow the merge request diff to store details about an external file
add_column :merge_request_diffs, :external_diff, :string
add_column :merge_request_diffs, :external_diff_store, :integer
add_column :merge_request_diffs, :stored_externally, :boolean
# The diff for each file is mapped to a range in the external file
add_column :merge_request_diff_files, :external_diff_offset, :integer
add_column :merge_request_diff_files, :external_diff_size, :integer
# If the diff is in object storage, it will be null in the database
change_column_null :merge_request_diff_files, :diff, true
end
end
......@@ -1203,8 +1203,10 @@ ActiveRecord::Schema.define(version: 20190131122559) do
t.string "b_mode", null: false
t.text "new_path", null: false
t.text "old_path", null: false
t.text "diff", null: false
t.text "diff"
t.boolean "binary"
t.integer "external_diff_offset"
t.integer "external_diff_size"
t.index ["merge_request_diff_id", "relative_order"], name: "index_merge_request_diff_files_on_mr_diff_id_and_order", unique: true, using: :btree
end
......@@ -1218,6 +1220,9 @@ ActiveRecord::Schema.define(version: 20190131122559) do
t.string "head_commit_sha"
t.string "start_commit_sha"
t.integer "commits_count"
t.string "external_diff"
t.integer "external_diff_store"
t.boolean "stored_externally"
t.index ["merge_request_id", "id"], name: "index_merge_request_diffs_on_merge_request_id_and_id", using: :btree
end
......
......@@ -48,6 +48,7 @@ Learn how to install, configure, update, and maintain your GitLab instance.
- [Third party offers](../user/admin_area/settings/third_party_offers.md)
- [Compliance](compliance.md): A collection of features from across the application that you may configure to help ensure that your GitLab instance and DevOps workflow meet compliance standards.
- [Diff limits](../user/admin_area/diff_limits.md): Configure the diff rendering size limits of branch comparison pages.
- [Merge request diffs](merge_request_diffs.md): Configure the diffs shown on merge requests
- [Broadcast Messages](../user/admin_area/broadcast_messages.md): Send messages to GitLab users through the UI.
#### Customizing GitLab's appearance
......
# Merge request diffs administration
> **Notes:**
> - External merge request diffs introduced in GitLab 11.8
Merge request diffs are size-limited copies of diffs associated with merge
requests. When viewing a merge request, diffs are sourced from these copies
wherever possible as a performance optimization.
By default, merge request diffs are stored in the database, in a table named
`merge_request_diff_files`. Larger installations may find this table grows too
large, in which case, switching to external storage is recommended.
### Using external storage
Merge request diffs can be stored on disk, or in object storage. In general, it
is better to store the diffs in the database than on disk.
To enable external storage of merge request diffs:
---
**In Omnibus installations:**
1. Edit `/etc/gitlab/gitlab.rb` and add the following line:
```ruby
gitlab_rails['external_diffs_enabled'] = true
```
1. _The external diffs will be stored in in
`/var/opt/gitlab/gitlab-rails/shared/external-diffs`._ To change the path,
for example to `/mnt/storage/external-diffs`, edit `/etc/gitlab/gitlab.rb`
and add the following line:
```ruby
gitlab_rails['external_diffs_storage_path'] = "/mnt/storage/external-diffs"
```
1. Save the file and [reconfigure GitLab][] for the changes to take effect.
---
**In installations from source:**
1. Edit `/home/git/gitlab/config/gitlab.yml` and add or amend the following
lines:
```yaml
external_diffs:
enabled: true
```
1. _The external diffs will be stored in
`/home/git/gitlab/shared/external-diffs`._ To change the path, for example
to `/mnt/storage/external-diffs`, edit `/home/git/gitlab/config/gitlab.yml`
and add or amend the following lines:
```yaml
external_diffs:
enabled: true
storage_path: /mnt/storage/external-diffs
```
1. Save the file and [restart GitLab][] for the changes to take effect.
### Using object storage
Instead of storing the external diffs on disk, we recommended you use an object
store like AWS S3 instead. This configuration relies on valid AWS credentials to
be configured already.
### Object Storage Settings
For source installations, these settings are nested under `external_diffs:` and
then `object_store:`. On omnibus installs, they are prefixed by
`external_diffs_object_store_`.
| Setting | Description | Default |
|---------|-------------|---------|
| `enabled` | Enable/disable object storage | `false` |
| `remote_directory` | The bucket name where external diffs will be stored| |
| `direct_upload` | Set to true to enable direct upload of external diffs without the need of local shared storage. Option may be removed once we decide to support only single storage for all files. | `false` |
| `background_upload` | Set to false to disable automatic upload. Option may be removed once upload is direct to S3 | `true` |
| `proxy_download` | Set to true to enable proxying all files served. Option allows to reduce egress traffic as this allows clients to download directly from remote storage instead of proxying all data | `false` |
| `connection` | Various connection options described below | |
#### S3 compatible connection settings
The connection settings match those provided by [Fog](https://github.com/fog), and are as follows:
| Setting | Description | Default |
|---------|-------------|---------|
| `provider` | Always `AWS` for compatible hosts | AWS |
| `aws_access_key_id` | AWS credentials, or compatible | |
| `aws_secret_access_key` | AWS credentials, or compatible | |
| `aws_signature_version` | AWS signature version to use. 2 or 4 are valid options. Digital Ocean Spaces and other providers may need 2. | 4 |
| `region` | AWS region | us-east-1 |
| `host` | S3 compatible host for when not using AWS, e.g. `localhost` or `storage.example.com` | s3.amazonaws.com |
| `endpoint` | Can be used when configuring an S3 compatible service such as [Minio](https://www.minio.io), by entering a URL such as `http://127.0.0.1:9000` | (optional) |
| `path_style` | Set to true to use `host/bucket_name/object` style paths instead of `bucket_name.host/object`. Leave as false for AWS S3 | false |
| `use_iam_profile` | Set to true to use IAM profile instead of access keys | false
**In Omnibus installations:**
1. Edit `/etc/gitlab/gitlab.rb` and add the following lines by replacing with
the values you want:
```ruby
gitlab_rails['external_diffs_enabled'] = true
gitlab_rails['external_diffs_object_store_enabled'] = true
gitlab_rails['external_diffs_object_store_remote_directory'] = "external-diffs"
gitlab_rails['external_diffs_object_store_connection'] = {
'provider' => 'AWS',
'region' => 'eu-central-1',
'aws_access_key_id' => 'AWS_ACCESS_KEY_ID',
'aws_secret_access_key' => 'AWS_SECRET_ACCESS_KEY'
}
```
NOTE: if you are using AWS IAM profiles, be sure to omit the
AWS access key and secret access key/value pairs. For example:
```ruby
gitlab_rails['external_diffs_object_store_connection'] = {
'provider' => 'AWS',
'region' => 'eu-central-1',
'use_iam_profile' => true
}
```
1. Save the file and [reconfigure GitLab][] for the changes to take effect.
---
**In installations from source:**
1. Edit `/home/git/gitlab/config/gitlab.yml` and add or amend the following
lines:
```yaml
external_diffs:
enabled: true
object_store:
enabled: true
remote_directory: "external-diffs" # The bucket name
connection:
provider: AWS # Only AWS supported at the moment
aws_access_key_id: AWS_ACCESS_KEY_ID
aws_secret_access_key: AWS_SECRET_ACCESS_KEY
region: eu-central-1
```
1. Save the file and [restart GitLab][] for the changes to take effect.
......@@ -18,6 +18,7 @@ There are many places where file uploading is used, according to contexts:
- Issues/MR/Notes Legacy Markdown attachments
- CI Artifacts (archive, metadata, trace)
- LFS Objects
- Merge request diffs
## Disk storage
......@@ -37,6 +38,7 @@ they are still not 100% standardized. You can see them below:
| Issues/MR/Notes Legacy Markdown attachments | no | uploads/-/system/note/attachment/:id/:filename | `AttachmentUploader` | Note |
| CI Artifacts (CE) | yes | shared/artifacts/:disk_hash[0..1]/:disk_hash[2..3]/:disk_hash/:year_:month_:date/:job_id/:job_artifact_id (:disk_hash is SHA256 digest of project_id) | `JobArtifactUploader` | Ci::JobArtifact |
| LFS Objects (CE) | yes | shared/lfs-objects/:hex/:hex/:object_hash | `LfsObjectUploader` | LfsObject |
| External merge request diffs | yes | shared/external-diffs/merge_request_diffs/mr-:parent_id/diff-:id | `ExternalDiffUploader` | MergeRequestDiff |
CI Artifacts and LFS Objects behave differently in CE and EE. In CE they inherit the `GitlabUploader`
while in EE they inherit the `ObjectStorage` and store files in and S3 API compatible object store.
......
......@@ -130,9 +130,14 @@ excluded_attributes:
snippets:
- :expired_at
merge_request_diff:
- :external_diff
- :stored_externally
- :external_diff_store
- :st_diffs
merge_request_diff_files:
- :diff
- :external_diff_offset
- :external_diff_size
issues:
- :milestone_id
merge_requests:
......
......@@ -46,7 +46,7 @@ describe MergeRequestDiff do
it { expect(first_diff.reload).not_to be_latest }
end
describe '#diffs' do
shared_examples_for 'merge request diffs' do
let(:merge_request) { create(:merge_request, :with_diffs) }
let!(:diff) { merge_request.merge_request_diff.reload }
......@@ -91,11 +91,10 @@ describe MergeRequestDiff do
diff.diffs.diff_files
end
end
end
describe '#raw_diffs' do
context 'when the :ignore_whitespace_change option is set' do
it 'creates a new compare object instead of loading from the DB' do
it 'creates a new compare object instead of using preprocessed data' do
expect(diff_with_commits).not_to receive(:load_diffs)
expect(diff_with_commits.compare).to receive(:diffs).and_call_original
......@@ -134,7 +133,7 @@ describe MergeRequestDiff do
diffs
end
it 'uses the diffs from the DB' do
it 'uses the preprocessed diffs' do
expect(diff_with_commits).to receive(:load_diffs)
diffs
......@@ -184,6 +183,19 @@ describe MergeRequestDiff do
expect(diff_file.diff).to eq(mr_diff.compare.diffs(paths: [path]).to_a.first.diff)
end
end
end
describe 'internal diffs configured' do
include_examples 'merge request diffs'
end
describe 'external diffs configured' do
before do
stub_external_diffs_setting(enabled: true)
end
include_examples 'merge request diffs'
end
describe '#commit_shas' do
it 'returns all commit SHAs using commits from the DB' do
......@@ -245,4 +257,55 @@ describe MergeRequestDiff do
expect(subject.modified_paths).to eq(%w{foo bar baz})
end
end
describe '#opening_external_diff' do
subject(:diff) { diff_with_commits }
context 'external diffs disabled' do
it { expect(diff.external_diff).not_to be_exists }
it 'yields nil' do
expect { |b| diff.opening_external_diff(&b) }.to yield_with_args(nil)
end
end
context 'external diffs enabled' do
let(:test_dir) { 'tmp/tests/external-diffs' }
around do |example|
FileUtils.mkdir_p(test_dir)
begin
example.run
ensure
FileUtils.rm_rf(test_dir)
end
end
before do
stub_external_diffs_setting(enabled: true, storage_path: test_dir)
end
it { expect(diff.external_diff).to be_exists }
it 'yields an open file' do
expect { |b| diff.opening_external_diff(&b) }.to yield_with_args(File)
end
it 'is re-entrant' do
outer_file_a =
diff.opening_external_diff do |outer_file|
diff.opening_external_diff do |inner_file|
expect(outer_file).to eq(inner_file)
end
outer_file
end
diff.opening_external_diff do |outer_file_b|
expect(outer_file_a).not_to eq(outer_file_b)
end
end
end
end
end
......@@ -56,6 +56,10 @@ module StubConfiguration
allow(Gitlab.config.lfs).to receive_messages(to_settings(messages))
end
def stub_external_diffs_setting(messages)
allow(Gitlab.config.external_diffs).to receive_messages(to_settings(messages))
end
def stub_artifacts_setting(messages)
allow(Gitlab.config.artifacts).to receive_messages(to_settings(messages))
end
......
......@@ -42,6 +42,13 @@ module StubObjectStorage
**params)
end
def stub_external_diffs_object_storage(uploader = described_class, **params)
stub_object_storage_uploader(config: Gitlab.config.external_diffs.object_store,
uploader: uploader,
remote_directory: 'external_diffs',
**params)
end
def stub_lfs_object_storage(**params)
stub_object_storage_uploader(config: Gitlab.config.lfs.object_store,
uploader: LfsObjectUploader,
......
require 'spec_helper'
describe ExternalDiffUploader do
let(:diff) { create(:merge_request).merge_request_diff }
let(:path) { Gitlab.config.external_diffs.storage_path }
subject(:uploader) { described_class.new(diff, :external_diff) }
it_behaves_like "builds correct paths",
store_dir: %r[merge_request_diffs/mr-\d+],
cache_dir: %r[/external-diffs/tmp/cache],
work_dir: %r[/external-diffs/tmp/work]
context "object store is REMOTE" do
before do
stub_external_diffs_object_storage
end
include_context 'with storage', described_class::Store::REMOTE
it_behaves_like "builds correct paths",
store_dir: %r[merge_request_diffs/mr-\d+]
end
describe 'migration to object storage' do
context 'with object storage disabled' do
it "is skipped" do
expect(ObjectStorage::BackgroundMoveWorker).not_to receive(:perform_async)
diff
end
end
context 'with object storage enabled' do
before do
stub_external_diffs_setting(enabled: true)
stub_external_diffs_object_storage(background_upload: true)
end
it 'is scheduled to run after creation' do
expect(ObjectStorage::BackgroundMoveWorker).to receive(:perform_async).with(described_class.name, 'MergeRequestDiff', :external_diff, kind_of(Numeric))
diff
end
end
end
describe 'remote file' do
context 'with object storage enabled' do
before do
stub_external_diffs_setting(enabled: true)
stub_external_diffs_object_storage
diff.update!(external_diff_store: described_class::Store::REMOTE)
end
it 'can store file remotely' do
allow(ObjectStorage::BackgroundMoveWorker).to receive(:perform_async)
diff
expect(diff.external_diff_store).to eq(described_class::Store::REMOTE)
expect(diff.external_diff.path).not_to be_blank
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment