Commit 1618aa49 authored by Sean McGivern's avatar Sean McGivern

Merge branch '4862-verify-file-checksums' into 'master'

Resolve "Geo: Verify locally stored files"

Closes #4862

See merge request gitlab-org/gitlab-ee!4753
parents f0a8d67a d69e7b44
......@@ -18,4 +18,8 @@ class LfsObject < ActiveRecord::Base
.where(lfs_objects_projects: { id: nil })
.destroy_all
end
# Computes the hex-encoded SHA256 digest of the file at +path+.
#
# @param path [String] location of the file to hash
# @return [String] lowercase hexadecimal SHA256 digest of the file contents
def self.calculate_oid(path)
  digest = Digest::SHA256.file(path)
  digest.hexdigest
end
end
---
title: Foreground verification of uploads and LFS objects
merge_request: 17402
author:
type: added
......@@ -78,34 +78,41 @@ Example output:
## Uploaded Files Integrity
The uploads check Rake task will loop through all uploads in the database
and run two checks to determine the integrity of each file:
Various types of file can be uploaded to a GitLab installation by users.
Checksums are generated and stored in the database upon upload, and integrity
checks using those checksums can be run. These checks also detect missing files.
1. Check if the file exists on the file system.
1. Check if the checksum of the file on the file system matches the checksum in the database.
Currently, integrity checks are supported for the following types of file:
* LFS objects
* User uploads
**Omnibus Installation**
```
sudo gitlab-rake gitlab:lfs:check
sudo gitlab-rake gitlab:uploads:check
```
**Source Installation**
```bash
sudo -u git -H bundle exec rake gitlab:lfs:check RAILS_ENV=production
sudo -u git -H bundle exec rake gitlab:uploads:check RAILS_ENV=production
```
This task also accepts some environment variables which you can use to override
These tasks also accept some environment variables which you can use to override
certain values:
Variable | Type | Description
-------- | ---- | -----------
`BATCH` | integer | Specifies the size of the batch. Defaults to 200.
`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value.
`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value.
Variable | Type | Description
--------- | ------- | -----------
`BATCH` | integer | Specifies the size of the batch. Defaults to 200.
`ID_FROM` | integer | Specifies the ID to start from, inclusive of the value.
`ID_TO` | integer | Specifies the ID value to end at, inclusive of the value.
`VERBOSE` | boolean | Causes failures to be listed individually, rather than being summarized.
```bash
sudo gitlab-rake gitlab:lfs:check BATCH=100 ID_FROM=50 ID_TO=250
sudo gitlab-rake gitlab:uploads:check BATCH=100 ID_FROM=50 ID_TO=250
```
......
# EE extension for the LFS objects verifier. It narrows the set of records
# that get verified by chaining `with_files_stored_locally` onto the relation
# returned by the class this module is prepended to.
module EE
module Gitlab
module Verify
module LfsObjects
extend ::Gitlab::Utils::Override
private
# Wraps the base `relation` (reached through `super`).
# NOTE(review): `with_files_stored_locally` is defined outside this view;
# presumably a scope that excludes records kept in object storage - confirm.
override :relation
def relation
super.with_files_stored_locally
end
end
end
end
end
# EE extension for the uploads verifier. It narrows the set of records that
# get verified by chaining `with_files_stored_locally` onto the relation
# returned by the class this module is prepended to.
module EE
module Gitlab
module Verify
module Uploads
extend ::Gitlab::Utils::Override
private
# Wraps the base `relation` (reached through `super`).
# NOTE(review): `with_files_stored_locally` is defined outside this view;
# presumably a scope that excludes records kept in object storage - confirm.
override :relation
def relation
super.with_files_stored_locally
end
end
end
end
end
require_relative 'helpers.rb'
namespace :gitlab do
  namespace :uploads do
    desc 'GitLab | Uploads | Check integrity of uploaded files'
    # Walks every upload in batches and prints one line per file, followed by
    # an error line when the file is missing or its checksum differs.
    task check: :environment do
      include UploadTaskHelpers

      puts 'Checking integrity of uploaded files'

      uploads_batches do |batch|
        batch.each do |upload|
          begin
            puts "- Checking file (#{upload.id}): #{upload.absolute_path}".color(:green)

            # A missing file is reported and the checksum comparison skipped.
            unless upload.exist?
              puts " * File does not exist on the file system".color(:red)
              next
            end

            check_checksum(upload)
          rescue ObjectStorage::RemoteStoreError
            # Raised somewhere in the checks above; such uploads are skipped.
            puts "- File (#{upload.id}): File is stored remotely, skipping".color(:yellow)
          end
        end
      end

      puts 'Done!'
    end
  end
end
# Helpers shared by the uploads rake tasks.
module UploadTaskHelpers
  # Number of records per batch, read from the BATCH environment variable
  # (200 when it is not set).
  #
  # @return [Integer]
  def batch_size
    (ENV['BATCH'] || 200).to_i
  end

  # @param absolute_path [String] path of the file to hash
  # @return [String] hex-encoded SHA256 digest of the file contents
  def calculate_checksum(absolute_path)
    digest = Digest::SHA256.file(absolute_path)
    digest.hexdigest
  end

  # Recomputes the checksum of the upload's file and prints a message when it
  # differs from the checksum stored on the record. Prints nothing on a match.
  #
  # @param upload [#absolute_path, #checksum]
  # @return [nil]
  def check_checksum(upload)
    actual = calculate_checksum(upload.absolute_path)
    return if actual == upload.checksum

    puts " * File checksum (#{actual}) does not match the one in the database (#{upload.checksum})".color(:red)
  end

  # Yields uploads one batch at a time. The ID range can be restricted through
  # the ID_FROM and ID_TO environment variables.
  def uploads_batches(&block)
    batching = { of: batch_size, start: ENV['ID_FROM'], finish: ENV['ID_TO'] }

    Upload.all.in_batches(**batching) do |relation| # rubocop: disable Cop/InBatches
      yield relation
    end
  end
end
require 'spec_helper'
describe Gitlab::Verify::LfsObjects do
  before { stub_lfs_object_storage }

  # Only the object without the :object_storage trait may show up in the
  # reported failures; the remote one must not be verified at all.
  it 'skips LFS objects in object storage' do
    local_failure = create(:lfs_object)
    create(:lfs_object, :object_storage)

    verifier = described_class.new(batch_size: 10)
    reported = {}
    verifier.run_batches do |_range, batch_failures|
      reported.update(batch_failures)
    end

    expect(reported.keys).to contain_exactly(local_failure)
  end
end
require 'spec_helper'
describe Gitlab::Verify::Uploads do
  before { stub_uploads_object_storage(AvatarUploader) }

  # Only the upload without the :object_storage trait may show up in the
  # reported failures; the remote one must not be verified at all.
  it 'skips uploads in object storage' do
    local_failure = create(:upload)
    create(:upload, :object_storage)

    verifier = described_class.new(batch_size: 10)
    reported = {}
    verifier.run_batches do |_range, batch_failures|
      reported.update(batch_failures)
    end

    expect(reported.keys).to contain_exactly(local_failure)
  end
end
......@@ -2,7 +2,7 @@ require 'rake_helper'
describe 'gitlab:lfs namespace rake task' do
before :all do
Rake.application.rake_require 'tasks/gitlab/lfs'
Rake.application.rake_require 'tasks/gitlab/lfs/migrate'
end
describe 'migrate' do
......
require 'rake_helper'
describe 'gitlab:uploads:check rake tasks' do
  # Upload record pointing at a fixture file that exists on disk.
  let!(:upload) do
    create(:upload, path: Rails.root.join('spec/fixtures/banana_sample.gif'))
  end

  before do
    Rake.application.rake_require 'tasks/gitlab/uploads/check'
  end

  it 'outputs the integrity check for each uploaded file' do
    checking_line = /Checking file \(#{upload.id}\): #{Regexp.quote(upload.absolute_path)}/

    expect { run_rake_task('gitlab:uploads:check') }.to output(checking_line).to_stdout
  end

  it 'errors out about missing files on the file system' do
    # A second upload created without an explicit path.
    create(:upload)

    missing_file_line = /File does not exist on the file system/

    expect { run_rake_task('gitlab:uploads:check') }.to output(missing_file_line).to_stdout
  end

  it 'errors out about invalid checksum' do
    upload.update_column(:checksum, '01a3156db2cf4f67ec823680b40b7302f89ab39179124ad219f94919b8a1769e')

    mismatch_line = /File checksum \(9e697aa09fe196909813ee36103e34f721fe47a5fdc8aac0e4e4ac47b9b38282\) does not match the one in the database \(#{upload.checksum}\)/

    expect { run_rake_task('gitlab:uploads:check') }.to output(mismatch_line).to_stdout
  end
end
module Gitlab
  module Verify
    # Base class for checksum verifiers that walk a relation in batches.
    #
    # Subclasses supply #name, #describe, #relation, #expected_checksum and
    # #actual_checksum; every one of them raises NotImplementedError here.
    class BatchVerifier
      attr_reader :batch_size, :start, :finish

      # @param batch_size [Integer] number of records per batch
      # @param start [Object, nil] forwarded to in_batches as `start:`
      # @param finish [Object, nil] forwarded to in_batches as `finish:`
      def initialize(batch_size:, start: nil, finish: nil)
        @batch_size = batch_size
        @start = start
        @finish = finish
      end

      # Yields a Range of IDs and a Hash of failed verifications (object => error)
      def run_batches(&blk)
        relation.in_batches(of: batch_size, start: start, finish: finish) do |batch| # rubocop: disable Cop/InBatches
          first_id = batch.first.id
          last_id = batch.last.id

          yield(first_id..last_id, run_batch(batch))
        end
      end

      # Human-readable name of the kind of object being verified.
      def name
        raise NotImplementedError
      end

      # Human-readable description of a single object.
      def describe(_object)
        raise NotImplementedError
      end

      private

      # Verifies every record of the batch and keeps only the failures.
      #
      # @return [Hash] failed object => error
      def run_batch(relation)
        relation.map(&method(:verify)).compact.to_h
      end

      # Compares the stored and the freshly calculated checksum of one object.
      # Both checksums are fetched before either check runs, so an error raised
      # while calculating the actual checksum is what gets reported.
      #
      # @return [nil, Array] nil on success, otherwise [object, error]
      def verify(object)
        expected = expected_checksum(object)
        actual = actual_checksum(object)

        if !expected.present?
          raise 'Checksum missing'
        elsif expected != actual
          raise 'Checksum mismatch'
        end

        nil
      rescue => error
        [object, error]
      end

      # This should return an ActiveRecord::Relation suitable for calling #in_batches on
      def relation
        raise NotImplementedError
      end

      # The checksum we expect the object to have
      def expected_checksum(_object)
        raise NotImplementedError
      end

      # The freshly-recalculated checksum of the object
      def actual_checksum(_object)
        raise NotImplementedError
      end
    end
  end
end
module Gitlab
  module Verify
    # Verifier for LFS objects: the stored `oid` is compared with the OID
    # recalculated from the object's file.
    class LfsObjects < BatchVerifier
      prepend ::EE::Gitlab::Verify::LfsObjects

      def name
        'LFS objects'
      end

      # @return [String] label identifying the LFS object by its oid
      def describe(object)
        format('LFS object: %s', object.oid)
      end

      private

      def relation
        LfsObject.all
      end

      # The oid stored on the record is the expected checksum.
      def expected_checksum(lfs_object)
        lfs_object.oid
      end

      # Recalculates the oid from the file referenced by the record.
      def actual_checksum(lfs_object)
        file_path = lfs_object.file.path
        LfsObject.calculate_oid(file_path)
      end
    end
  end
end
module Gitlab
  module Verify
    # Drives a verifier from a rake task and prints one summary line per batch.
    class RakeTask
      # Builds the verifier from environment variables and runs it.
      #
      # BATCH (default 200), ID_FROM and ID_TO configure the verifier;
      # VERBOSE switches on the per-failure listing.
      #
      # @param verify_kls [Class] verifier class accepting batch_size:, start:, finish:
      def self.run!(verify_kls)
        options = {
          batch_size: ENV.fetch('BATCH', 200).to_i,
          start: ENV['ID_FROM'],
          finish: ENV['ID_TO']
        }

        new(verify_kls.new(**options), Gitlab::Utils.to_boolean(ENV['VERBOSE'])).run!
      end

      attr_reader :verifier, :output

      def initialize(verifier, verbose)
        @verifier = verifier
        @verbose = verbose
      end

      # Prints a header, one line per verified batch, then a footer.
      def run!
        say "Checking integrity of #{verifier.name}"

        verifier.run_batches(&method(:run_batch))

        say 'Done!'
      end

      # @return [Boolean] whether individual failures are listed
      def verbose?
        @verbose ? true : false
      end

      private

      def say(text)
        puts(text) # rubocop:disable Rails/Output
      end

      # Prints the failure count of a batch (green when empty, red otherwise)
      # and, in verbose mode, one line per failed object.
      def run_batch(range, failures)
        status_color =
          if failures.empty?
            :green
          else
            :red
          end

        say "- #{range}: Failures: #{failures.count}".color(status_color)

        return unless verbose?

        failures.each do |object, error|
          say " - #{verifier.describe(object)}: #{error.inspect}".color(:red)
        end
      end
    end
  end
end
module Gitlab
  module Verify
    # Verifier for uploads: the stored `checksum` is compared with a digest
    # recalculated from the file at the upload's absolute path.
    class Uploads < BatchVerifier
      prepend ::EE::Gitlab::Verify::Uploads

      def name
        'Uploads'
      end

      # @return [String] label identifying the upload by its id
      def describe(object)
        format('Upload: %s', object.id)
      end

      private

      def relation
        Upload.all
      end

      # The checksum stored on the record is the expected checksum.
      def expected_checksum(upload)
        upload.checksum
      end

      # Recalculates the digest from the file referenced by the record.
      def actual_checksum(upload)
        file_path = upload.absolute_path
        Upload.hexdigest(file_path)
      end
    end
  end
end
# Rake task `gitlab:lfs:check`: hands the LFS objects verifier class to
# Gitlab::Verify::RakeTask.run!, which builds the verifier and runs it.
namespace :gitlab do
namespace :lfs do
desc 'GitLab | LFS | Check integrity of uploaded LFS objects'
# Depends on :environment so the application is loaded before the check runs.
task check: :environment do
Gitlab::Verify::RakeTask.run!(Gitlab::Verify::LfsObjects)
end
end
end
# Rake task `gitlab:uploads:check`: hands the uploads verifier class to
# Gitlab::Verify::RakeTask.run!, which builds the verifier and runs it.
namespace :gitlab do
namespace :uploads do
desc 'GitLab | Uploads | Check integrity of uploaded files'
# Depends on :environment so the application is loaded before the check runs.
task check: :environment do
Gitlab::Verify::RakeTask.run!(Gitlab::Verify::Uploads)
end
end
end
......@@ -9,4 +9,14 @@ FactoryBot.define do
# Attaches the dk.png fixture as the object's file.
# The MIME type was previously the typo "`/png"; the fixture is a PNG image,
# so the proper "image/png" content type is passed instead.
trait :with_file do
  file { fixture_file_upload(Rails.root + "spec/fixtures/dk.png", "image/png") }
end
# The uniqueness constraint means we can't use the correct OID for all LFS
# objects, so the test needs to decide which (if any) object gets it.
# NOTE(review): the value below is presumably the SHA256 digest of the
# spec/fixtures/dk.png fixture used by the :with_file trait - confirm.
trait :correct_oid do
oid 'b804383982bb89b00e828e3f44c038cc991d3d1768009fc39ba8e2c081b9fb75'
end
# Marks the object as kept in the remote store by setting its file_store to
# LfsObjectUploader::Store::REMOTE.
trait :object_storage do
file_store { LfsObjectUploader::Store::REMOTE }
end
end
require 'spec_helper'
describe Gitlab::Verify::LfsObjects do
  include GitlabVerifyHelpers

  it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
    let!(:objects) { create_list(:lfs_object, 3, :with_file) }
  end

  describe '#run_batches' do
    # Failures are collected lazily, after each example has set up its scenario.
    let(:failures) { collect_failures }
    let(:failure) { failures[lfs_object] }

    let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }

    it 'passes LFS objects with the correct file' do
      expect(failures).to eq({})
    end

    it 'fails LFS objects with a missing file' do
      removed_path = lfs_object.file.path
      FileUtils.rm_f(removed_path)

      expect(failures.keys).to contain_exactly(lfs_object)
      expect(failure).to be_a(Errno::ENOENT)
      expect(failure.to_s).to include(removed_path)
    end

    it 'fails LFS objects with a mismatched oid' do
      # Emptying the file changes its digest without touching the stored oid.
      File.truncate(lfs_object.file.path, 0)

      expect(failures.keys).to contain_exactly(lfs_object)
      expect(failure.to_s).to include('Checksum mismatch')
    end
  end
end
require 'spec_helper'
describe Gitlab::Verify::Uploads do
  include GitlabVerifyHelpers

  it_behaves_like 'Gitlab::Verify::BatchVerifier subclass' do
    let(:projects) { create_list(:project, 3, :with_avatar) }
    let!(:objects) { projects.flat_map { |project| project.uploads } }
  end

  describe '#run_batches' do
    let(:project) { create(:project, :with_avatar) }

    # Failures are collected lazily, after each example has set up its scenario.
    let(:failures) { collect_failures }
    let(:failure) { failures[upload] }

    let!(:upload) { project.uploads.first }

    it 'passes uploads with the correct file' do
      expect(failures).to eq({})
    end

    it 'fails uploads with a missing file' do
      removed_path = upload.absolute_path
      FileUtils.rm_f(removed_path)

      expect(failures.keys).to contain_exactly(upload)
      expect(failure).to be_a(Errno::ENOENT)
      expect(failure.to_s).to include(removed_path)
    end

    it 'fails uploads with a mismatched checksum' do
      upload.update!(checksum: 'something incorrect')

      expect(failures.keys).to contain_exactly(upload)
      expect(failure.to_s).to include('Checksum mismatch')
    end

    it 'fails uploads with a missing precalculated checksum' do
      upload.update!(checksum: '')

      expect(failures.keys).to contain_exactly(upload)
      expect(failure.to_s).to include('Checksum missing')
    end
  end
end
# Shared batching examples. The including context must define `objects`
# (at least three records) and provide `collect_ranges`.
RSpec.shared_examples 'Gitlab::Verify::BatchVerifier subclass' do
  describe 'batching' do
    # Each expected batch is a single-ID range covering one object.
    let(:first_batch) do
      id = objects[0].id
      id..id
    end

    let(:second_batch) do
      id = objects[1].id
      id..id
    end

    let(:third_batch) do
      id = objects[2].id
      id..id
    end

    it 'iterates through objects in batches' do
      expect(collect_ranges).to eq([first_batch, second_batch, third_batch])
    end

    it 'allows the starting ID to be specified' do
      ranges = collect_ranges(start: second_batch.begin)

      expect(ranges).to eq([second_batch, third_batch])
    end

    it 'allows the finishing ID to be specified' do
      ranges = collect_ranges(finish: second_batch.end)

      expect(ranges).to eq([first_batch, second_batch])
    end
  end
end
# Spec helpers for exercising verifier classes through `described_class`.
# Every helper builds the verifier with a batch size of 1, so each batch
# holds a single object.
module GitlabVerifyHelpers
  # Runs the verifier and returns the ID range of every batch, in order.
  #
  # @param args [Hash] extra keyword options for the verifier (e.g. start:, finish:)
  # @return [Array<Range>]
  def collect_ranges(args = {})
    # The options are splatted explicitly: the verifier takes keyword
    # arguments, and Ruby 3 no longer converts a positional Hash into them.
    verifier = described_class.new(**args.merge(batch_size: 1))

    collect_results(verifier).map(&:first)
  end

  # Runs the verifier and merges the failures of all batches into one Hash.
  #
  # @return [Hash] failed object => error
  def collect_failures
    verifier = described_class.new(batch_size: 1)

    collect_results(verifier).each_with_object({}) do |(_range, failures), merged|
      merged.merge!(failures)
    end
  end

  # Collects the arguments of every yield made by verifier#run_batches.
  #
  # @return [Array<Array>] one [range, failures] pair per batch
  def collect_results(verifier)
    [].tap do |results|
      verifier.run_batches { |*args| results << args }
    end
  end
end
require 'rake_helper'
describe 'gitlab:lfs rake tasks' do
  describe 'check' do
    let!(:lfs_object) { create(:lfs_object, :with_file, :correct_oid) }

    before do
      Rake.application.rake_require('tasks/gitlab/lfs/check')

      # Verbose mode makes the task print each individual failure.
      stub_env('VERBOSE' => 'true')
    end

    # Invokes the task under test.
    def run_check
      run_rake_task('gitlab:lfs:check')
    end

    it 'outputs the integrity check for each batch' do
      expect { run_check }.to output(/Failures: 0/).to_stdout
    end

    it 'errors out about missing files on the file system' do
      removed_path = lfs_object.file.path
      FileUtils.rm_f(removed_path)

      missing_file_line = /No such file.*#{Regexp.quote(removed_path)}/

      expect { run_check }.to output(missing_file_line).to_stdout
    end

    it 'errors out about invalid checksum' do
      # Emptying the file changes its digest without touching the stored oid.
      File.truncate(lfs_object.file.path, 0)

      expect { run_check }.to output(/Checksum mismatch/).to_stdout
    end
  end
end
require 'rake_helper'
describe 'gitlab:uploads rake tasks' do
  describe 'check' do
    # Upload record pointing at a fixture file that exists on disk.
    let!(:upload) do
      create(:upload, path: Rails.root.join('spec/fixtures/banana_sample.gif'))
    end

    before do
      Rake.application.rake_require('tasks/gitlab/uploads/check')

      # Verbose mode makes the task print each individual failure.
      stub_env('VERBOSE' => 'true')
    end

    # Invokes the task under test.
    def run_check
      run_rake_task('gitlab:uploads:check')
    end

    it 'outputs the integrity check for each batch' do
      expect { run_check }.to output(/Failures: 0/).to_stdout
    end

    it 'errors out about missing files on the file system' do
      missing_upload = create(:upload)

      missing_file_line = /No such file.*#{Regexp.quote(missing_upload.absolute_path)}/

      expect { run_check }.to output(missing_file_line).to_stdout
    end

    it 'errors out about invalid checksum' do
      upload.update_column(:checksum, '01a3156db2cf4f67ec823680b40b7302f89ab39179124ad219f94919b8a1769e')

      expect { run_check }.to output(/Checksum mismatch/).to_stdout
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment