Commit 61b0577d authored by Stan Hu's avatar Stan Hu

Merge branch 'mk/rake-task-verify-remote-files' into 'master'

Add support for verifying remote uploads, artifacts, and LFS objects in check rake tasks

Closes #47139

See merge request gitlab-org/gitlab-ce!19501
parents cc1b0354 2f40fb45
---
title: Add support for verifying remote uploads, artifacts, and LFS objects in check rake tasks
merge_request: 19501
author:
type: added
...@@ -78,9 +78,10 @@ Example output: ...@@ -78,9 +78,10 @@ Example output:
## Uploaded Files Integrity ## Uploaded Files Integrity
Various types of file can be uploaded to a GitLab installation by users. Various types of files can be uploaded to a GitLab installation by users.
Checksums are generated and stored in the database upon upload, and integrity These integrity checks can detect missing files. Additionally, for locally
checks using those checksums can be run. These checks also detect missing files. stored files, checksums are generated and stored in the database upon upload,
and these checks will verify them against current files.
Currently, integrity checks are supported for the following types of file: Currently, integrity checks are supported for the following types of file:
......
...@@ -7,13 +7,15 @@ module Gitlab ...@@ -7,13 +7,15 @@ module Gitlab
@batch_size = batch_size @batch_size = batch_size
@start = start @start = start
@finish = finish @finish = finish
fix_google_api_logger
end end
# Yields a Range of IDs and a Hash of failed verifications (object => error) # Yields a Range of IDs and a Hash of failed verifications (object => error)
def run_batches(&blk) def run_batches(&blk)
relation.in_batches(of: batch_size, start: start, finish: finish) do |relation| # rubocop: disable Cop/InBatches all_relation.in_batches(of: batch_size, start: start, finish: finish) do |batch| # rubocop: disable Cop/InBatches
range = relation.first.id..relation.last.id range = batch.first.id..batch.last.id
failures = run_batch(relation) failures = run_batch_for(batch)
yield(range, failures) yield(range, failures)
end end
...@@ -29,24 +31,56 @@ module Gitlab ...@@ -29,24 +31,56 @@ module Gitlab
private private
def run_batch(relation) def run_batch_for(batch)
relation.map { |upload| verify(upload) }.compact.to_h batch.map { |upload| verify(upload) }.compact.to_h
end end
def verify(object) def verify(object)
local?(object) ? verify_local(object) : verify_remote(object)
rescue => err
failure(object, err.inspect)
end
def verify_local(object)
expected = expected_checksum(object) expected = expected_checksum(object)
actual = actual_checksum(object) actual = actual_checksum(object)
raise 'Checksum missing' unless expected.present? return failure(object, 'Checksum missing') unless expected.present?
raise 'Checksum mismatch' unless expected == actual return failure(object, 'Checksum mismatch') unless expected == actual
success
end
# We don't calculate checksum for remote objects, so just check existence
def verify_remote(object)
return failure(object, 'Remote object does not exist') unless remote_object_exists?(object)
success
end
def success
nil nil
rescue => err end
[object, err]
def failure(object, message)
[object, message]
end
# It's already set to Logger::INFO, but acts as if it is set to
# Logger::DEBUG, and this fixes it...
def fix_google_api_logger
if Object.const_defined?('Google::Apis')
Google::Apis.logger.level = Logger::INFO
end
end end
# This should return an ActiveRecord::Relation suitable for calling #in_batches on # This should return an ActiveRecord::Relation suitable for calling #in_batches on
def relation def all_relation
raise NotImplementedError.new
end
# Should return true if the object is stored locally
def local?(_object)
raise NotImplementedError.new raise NotImplementedError.new
end end
...@@ -59,6 +93,11 @@ module Gitlab ...@@ -59,6 +93,11 @@ module Gitlab
def actual_checksum(_object) def actual_checksum(_object)
raise NotImplementedError.new raise NotImplementedError.new
end end
# Be sure to perform a hard check of the remote object (don't just check DB value)
def remote_object_exists?(object)
raise NotImplementedError.new
end
end end
end end
end end
...@@ -11,10 +11,14 @@ module Gitlab ...@@ -11,10 +11,14 @@ module Gitlab
private private
def relation def all_relation
::Ci::JobArtifact.all ::Ci::JobArtifact.all
end end
def local?(artifact)
artifact.local_store?
end
def expected_checksum(artifact) def expected_checksum(artifact)
artifact.file_sha256 artifact.file_sha256
end end
...@@ -22,6 +26,10 @@ module Gitlab ...@@ -22,6 +26,10 @@ module Gitlab
def actual_checksum(artifact) def actual_checksum(artifact)
Digest::SHA256.file(artifact.file.path).hexdigest Digest::SHA256.file(artifact.file.path).hexdigest
end end
def remote_object_exists?(artifact)
artifact.file.file.exists?
end
end end
end end
end end
...@@ -11,8 +11,12 @@ module Gitlab ...@@ -11,8 +11,12 @@ module Gitlab
private private
def relation def all_relation
LfsObject.with_files_stored_locally LfsObject.all
end
def local?(lfs_object)
lfs_object.local_store?
end end
def expected_checksum(lfs_object) def expected_checksum(lfs_object)
...@@ -22,6 +26,10 @@ module Gitlab ...@@ -22,6 +26,10 @@ module Gitlab
def actual_checksum(lfs_object) def actual_checksum(lfs_object)
LfsObject.calculate_oid(lfs_object.file.path) LfsObject.calculate_oid(lfs_object.file.path)
end end
def remote_object_exists?(lfs_object)
lfs_object.file.file.exists?
end
end end
end end
end end
...@@ -45,7 +45,7 @@ module Gitlab ...@@ -45,7 +45,7 @@ module Gitlab
return unless verbose? return unless verbose?
failures.each do |object, error| failures.each do |object, error|
say " - #{verifier.describe(object)}: #{error.inspect}".color(:red) say " - #{verifier.describe(object)}: #{error}".color(:red)
end end
end end
end end
......
...@@ -11,8 +11,12 @@ module Gitlab ...@@ -11,8 +11,12 @@ module Gitlab
private private
def relation def all_relation
Upload.with_files_stored_locally Upload.all
end
def local?(upload)
upload.local?
end end
def expected_checksum(upload) def expected_checksum(upload)
...@@ -22,6 +26,10 @@ module Gitlab ...@@ -22,6 +26,10 @@ module Gitlab
def actual_checksum(upload) def actual_checksum(upload)
Upload.hexdigest(upload.absolute_path) Upload.hexdigest(upload.absolute_path)
end end
def remote_object_exists?(upload)
upload.build_uploader.file.exists?
end
end end
end end
end end
...@@ -21,15 +21,38 @@ describe Gitlab::Verify::JobArtifacts do ...@@ -21,15 +21,38 @@ describe Gitlab::Verify::JobArtifacts do
FileUtils.rm_f(artifact.file.path) FileUtils.rm_f(artifact.file.path)
expect(failures.keys).to contain_exactly(artifact) expect(failures.keys).to contain_exactly(artifact)
expect(failure).to be_a(Errno::ENOENT) expect(failure).to include('No such file or directory')
expect(failure.to_s).to include(artifact.file.path) expect(failure).to include(artifact.file.path)
end end
it 'fails artifacts with a mismatched checksum' do it 'fails artifacts with a mismatched checksum' do
File.truncate(artifact.file.path, 0) File.truncate(artifact.file.path, 0)
expect(failures.keys).to contain_exactly(artifact) expect(failures.keys).to contain_exactly(artifact)
expect(failure.to_s).to include('Checksum mismatch') expect(failure).to include('Checksum mismatch')
end
context 'with remote files' do
let(:file) { double(:file) }
before do
stub_artifacts_object_storage
artifact.update!(file_store: ObjectStorage::Store::REMOTE)
expect(CarrierWave::Storage::Fog::File).to receive(:new).and_return(file)
end
it 'passes artifacts in object storage that exist' do
expect(file).to receive(:exists?).and_return(true)
expect(failures).to eq({})
end
it 'fails artifacts in object storage that do not exist' do
expect(file).to receive(:exists?).and_return(false)
expect(failures.keys).to contain_exactly(artifact)
expect(failure).to include('Remote object does not exist')
end
end end
end end
end end
...@@ -21,30 +21,37 @@ describe Gitlab::Verify::LfsObjects do ...@@ -21,30 +21,37 @@ describe Gitlab::Verify::LfsObjects do
FileUtils.rm_f(lfs_object.file.path) FileUtils.rm_f(lfs_object.file.path)
expect(failures.keys).to contain_exactly(lfs_object) expect(failures.keys).to contain_exactly(lfs_object)
expect(failure).to be_a(Errno::ENOENT) expect(failure).to include('No such file or directory')
expect(failure.to_s).to include(lfs_object.file.path) expect(failure).to include(lfs_object.file.path)
end end
it 'fails LFS objects with a mismatched oid' do it 'fails LFS objects with a mismatched oid' do
File.truncate(lfs_object.file.path, 0) File.truncate(lfs_object.file.path, 0)
expect(failures.keys).to contain_exactly(lfs_object) expect(failures.keys).to contain_exactly(lfs_object)
expect(failure.to_s).to include('Checksum mismatch') expect(failure).to include('Checksum mismatch')
end end
context 'with remote files' do context 'with remote files' do
let(:file) { double(:file) }
before do before do
stub_lfs_object_storage stub_lfs_object_storage
lfs_object.update!(file_store: ObjectStorage::Store::REMOTE)
expect(CarrierWave::Storage::Fog::File).to receive(:new).and_return(file)
end end
it 'skips LFS objects in object storage' do it 'passes LFS objects in object storage that exist' do
local_failure = create(:lfs_object) expect(file).to receive(:exists?).and_return(true)
create(:lfs_object, :object_storage)
expect(failures).to eq({})
end
failures = {} it 'fails LFS objects in object storage that do not exist' do
described_class.new(batch_size: 10).run_batches { |_, failed| failures.merge!(failed) } expect(file).to receive(:exists?).and_return(false)
expect(failures.keys).to contain_exactly(local_failure) expect(failures.keys).to contain_exactly(lfs_object)
expect(failure).to include('Remote object does not exist')
end end
end end
end end
......
...@@ -23,37 +23,44 @@ describe Gitlab::Verify::Uploads do ...@@ -23,37 +23,44 @@ describe Gitlab::Verify::Uploads do
FileUtils.rm_f(upload.absolute_path) FileUtils.rm_f(upload.absolute_path)
expect(failures.keys).to contain_exactly(upload) expect(failures.keys).to contain_exactly(upload)
expect(failure).to be_a(Errno::ENOENT) expect(failure).to include('No such file or directory')
expect(failure.to_s).to include(upload.absolute_path) expect(failure).to include(upload.absolute_path)
end end
it 'fails uploads with a mismatched checksum' do it 'fails uploads with a mismatched checksum' do
upload.update!(checksum: 'something incorrect') upload.update!(checksum: 'something incorrect')
expect(failures.keys).to contain_exactly(upload) expect(failures.keys).to contain_exactly(upload)
expect(failure.to_s).to include('Checksum mismatch') expect(failure).to include('Checksum mismatch')
end end
it 'fails uploads with a missing precalculated checksum' do it 'fails uploads with a missing precalculated checksum' do
upload.update!(checksum: '') upload.update!(checksum: '')
expect(failures.keys).to contain_exactly(upload) expect(failures.keys).to contain_exactly(upload)
expect(failure.to_s).to include('Checksum missing') expect(failure).to include('Checksum missing')
end end
context 'with remote files' do context 'with remote files' do
let(:file) { double(:file) }
before do before do
stub_uploads_object_storage(AvatarUploader) stub_uploads_object_storage(AvatarUploader)
upload.update!(store: ObjectStorage::Store::REMOTE)
expect(CarrierWave::Storage::Fog::File).to receive(:new).and_return(file)
end end
it 'skips uploads in object storage' do it 'passes uploads in object storage that exist' do
local_failure = create(:upload) expect(file).to receive(:exists?).and_return(true)
create(:upload, :object_storage)
expect(failures).to eq({})
end
failures = {} it 'fails uploads in object storage that do not exist' do
described_class.new(batch_size: 10).run_batches { |_, failed| failures.merge!(failed) } expect(file).to receive(:exists?).and_return(false)
expect(failures.keys).to contain_exactly(local_failure) expect(failures.keys).to contain_exactly(upload)
expect(failure).to include('Remote object does not exist')
end end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment