Commit 21fd3c4d authored by Douwe Maan's avatar Douwe Maan

Merge branch 'tc-repo-verify-mails-ee' into 'master'

Repository verification on Geo secondary

Closes #5564 and #4746

See merge request gitlab-org/gitlab-ee!5550
parents 88e5f4e9 a224f6f4
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
Enable Repository Checks Enable Repository Checks
.help-block .help-block
GitLab will periodically run GitLab will periodically run
%a{ href: 'https://www.kernel.org/pub/software/scm/git/docs/git-fsck.html', target: 'blank' } 'git fsck' %a{ href: 'https://git-scm.com/docs/git-fsck', target: 'blank' } 'git fsck'
in all project and wiki repositories to look for silent disk corruption issues. in all project and wiki repositories to look for silent disk corruption issues.
.form-group .form-group
.col-sm-offset-2.col-sm-10 .col-sm-offset-2.col-sm-10
......
...@@ -3,6 +3,12 @@ class AdminEmailWorker ...@@ -3,6 +3,12 @@ class AdminEmailWorker
include CronjobQueue include CronjobQueue
def perform def perform
send_repository_check_mail if Gitlab::CurrentSettings.repository_checks_enabled
end
private
def send_repository_check_mail
repository_check_failed_count = Project.where(last_repository_check_failed: true).count repository_check_failed_count = Project.where(last_repository_check_failed: true).count
return if repository_check_failed_count.zero? return if repository_check_failed_count.zero?
......
module RepositoryCheck module RepositoryCheck
class BatchWorker class BatchWorker
prepend ::EE::RepositoryCheck::BatchWorker
include ApplicationWorker include ApplicationWorker
include CronjobQueue include CronjobQueue
RUN_TIME = 3600 RUN_TIME = 3600
BATCH_SIZE = 10_000
def perform def perform
return unless Gitlab::CurrentSettings.repository_checks_enabled
start = Time.now start = Time.now
# This loop will break after a little more than one hour ('a little # This loop will break after a little more than one hour ('a little
...@@ -15,7 +20,6 @@ module RepositoryCheck ...@@ -15,7 +20,6 @@ module RepositoryCheck
# check, only one (or two) will be checked at a time. # check, only one (or two) will be checked at a time.
project_ids.each do |project_id| project_ids.each do |project_id|
break if Time.now - start >= RUN_TIME break if Time.now - start >= RUN_TIME
break unless current_settings.repository_checks_enabled
next unless try_obtain_lease(project_id) next unless try_obtain_lease(project_id)
...@@ -31,12 +35,20 @@ module RepositoryCheck ...@@ -31,12 +35,20 @@ module RepositoryCheck
# getting ID's from Postgres is not terribly slow, and because no user # getting ID's from Postgres is not terribly slow, and because no user
# has to sit and wait for this query to finish. # has to sit and wait for this query to finish.
def project_ids def project_ids
limit = 10_000 never_checked_project_ids(BATCH_SIZE) + old_checked_project_ids(BATCH_SIZE)
never_checked_projects = Project.where('last_repository_check_at IS NULL AND created_at < ?', 24.hours.ago) end
.limit(limit).pluck(:id)
old_check_projects = Project.where('last_repository_check_at < ?', 1.month.ago) def never_checked_project_ids(batch_size)
.reorder('last_repository_check_at ASC').limit(limit).pluck(:id) Project.where(last_repository_check_at: nil)
never_checked_projects + old_check_projects .where('created_at < ?', 24.hours.ago)
.limit(batch_size).pluck(:id)
end
def old_checked_project_ids(batch_size)
Project.where.not(last_repository_check_at: nil)
.where('last_repository_check_at < ?', 1.month.ago)
.reorder(last_repository_check_at: :asc)
.limit(batch_size).pluck(:id)
end end
def try_obtain_lease(id) def try_obtain_lease(id)
...@@ -47,16 +59,5 @@ module RepositoryCheck ...@@ -47,16 +59,5 @@ module RepositoryCheck
timeout: 24.hours timeout: 24.hours
).try_obtain ).try_obtain
end end
def current_settings
# No caching of the settings! If we cache them and an admin disables
# this feature, an active RepositoryCheckWorker would keep going for up
# to 1 hour after the feature was disabled.
if Rails.env.test?
Gitlab::CurrentSettings.fake_application_settings
else
ApplicationSetting.current
end
end
end end
end end
...@@ -3,29 +3,38 @@ module RepositoryCheck ...@@ -3,29 +3,38 @@ module RepositoryCheck
include ApplicationWorker include ApplicationWorker
include RepositoryCheckQueue include RepositoryCheckQueue
prepend ::EE::RepositoryCheck::SingleRepositoryWorker
def perform(project_id) def perform(project_id)
project = Project.find(project_id) project = Project.find(project_id)
healthy = project_healthy?(project)
update_repository_check_status(project, healthy)
end
private
def update_repository_check_status(project, healthy)
project.update_columns( project.update_columns(
last_repository_check_failed: !check(project), last_repository_check_failed: !healthy,
last_repository_check_at: Time.now last_repository_check_at: Time.now
) )
end end
private def project_healthy?(project)
repo_healthy?(project) && wiki_repo_healthy?(project)
end
def check(project) def repo_healthy?(project)
if has_pushes?(project) && !git_fsck(project.repository) return true unless has_changes?(project)
false
elsif project.wiki_enabled?
# Historically some projects never had their wiki repos initialized;
# this happens on project creation now. Let's initialize an empty repo
# if it is not already there.
project.create_wiki
git_fsck(project.wiki.repository) git_fsck(project.repository)
else
true
end end
def wiki_repo_healthy?(project)
return true unless has_wiki_changes?(project)
git_fsck(project.wiki.repository)
end end
def git_fsck(repository) def git_fsck(repository)
...@@ -39,8 +48,19 @@ module RepositoryCheck ...@@ -39,8 +48,19 @@ module RepositoryCheck
false false
end end
def has_pushes?(project) def has_changes?(project)
Project.with_push.exists?(project.id) Project.with_push.exists?(project.id)
end end
def has_wiki_changes?(project)
return false unless project.wiki_enabled?
# Historically some projects never had their wiki repos initialized;
# this happens on project creation now. Let's initialize an empty repo
# if it is not already there.
return false unless project.create_wiki
has_changes?(project)
end
end end
end end
...@@ -14,11 +14,11 @@ checks failed you can see their output on the admin log page under ...@@ -14,11 +14,11 @@ checks failed you can see their output on the admin log page under
## Periodic checks ## Periodic checks
When enabled, GitLab periodically runs a repository check on all project When enabled, GitLab periodically runs a repository check on all project
repositories and wiki repositories in order to detect data corruption problems. repositories and wiki repositories in order to detect data corruption.
A project will be checked no more than once per month. If any projects A project will be checked no more than once per month. If any projects
fail their repository checks all GitLab administrators will receive an email fail their repository checks all GitLab administrators will receive an email
notification of the situation. This notification is sent out once a week on notification of the situation. This notification is sent out once a week,
Sunday, by default. by default, midnight at the start of Sunday.
## Disabling periodic checks ## Disabling periodic checks
...@@ -28,16 +28,18 @@ panel. ...@@ -28,16 +28,18 @@ panel.
## What to do if a check failed ## What to do if a check failed
If the repository check fails for some repository you should look up the error If the repository check fails for some repository you should look up the error
in repocheck.log (in the admin panel or on disk; see in `repocheck.log`:
`/var/log/gitlab/gitlab-rails` for Omnibus installations or
`/home/git/gitlab/log` for installations from source). Once you have - in the [admin panel](logs.md#repocheck.log)
resolved the issue use the admin panel to trigger a new repository check on - or on disk, see:
the project. This will clear the 'check failed' state. - `/var/log/gitlab/gitlab-rails` for Omnibus installations
- `/home/git/gitlab/log` for installations from source
If for some reason the periodic repository check caused a lot of false If for some reason the periodic repository check caused a lot of false
alarms you can choose to clear ALL repository check states from the alarms you can choose to clear *all* repository check states by
'Settings' page of the admin panel. clicking "Clear all repository checks" on the **Settings** page of the
admin panel (`/admin/application_settings`).
--- ---
[ce-3232]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/3232 "Auto git fsck" [ce-3232]: https://gitlab.com/gitlab-org/gitlab-ce/merge_requests/3232 "Auto git fsck"
[git-fsck]: https://www.kernel.org/pub/software/scm/git/docs/git-fsck.html "git fsck documentation" [git-fsck]: https://git-scm.com/docs/git-fsck "git fsck documentation"
module EE
module RepositoryCheck
module BatchWorker
extend ActiveSupport::Concern
extend ::Gitlab::Utils::Override
private
override :never_checked_project_ids
def never_checked_project_ids(batch_size)
return super unless ::Gitlab::Geo.secondary?
Geo::ProjectRegistry.synced_repos.synced_wikis
.where(last_repository_check_at: nil)
.where('last_repository_synced_at < ?', 24.hours.ago)
.where('last_wiki_synced_at < ?', 24.hours.ago)
.limit(batch_size).pluck(:project_id)
end
override :old_checked_project_ids
def old_checked_project_ids(batch_size)
return super unless ::Gitlab::Geo.secondary?
Geo::ProjectRegistry.synced_repos.synced_wikis
.where('last_repository_check_at < ?', 1.month.ago)
.reorder(last_repository_check_at: :asc)
.limit(batch_size).pluck(:project_id)
end
end
end
end
module EE
module RepositoryCheck
module SingleRepositoryWorker
extend ActiveSupport::Concern
extend ::Gitlab::Utils::Override
private
override :update_repository_check_status
def update_repository_check_status(project, healthy)
return super unless ::Gitlab::Geo.secondary?
project_registry = ::Geo::ProjectRegistry.find_or_initialize_by(project: project)
project_registry.assign_attributes(
last_repository_check_failed: !healthy,
last_repository_check_at: Time.zone.now
)
project_registry.save!
end
end
end
end
---
title: Run repository verification on Geo secondary
merge_request: 5550
author:
type: added
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class AddRepositoryCheckToGeoProjectRegistry < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
def change
add_column :project_registry, :last_repository_check_failed, :boolean
add_column :project_registry, :last_repository_check_at, :datetime_with_timezone
end
end
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20180419192603) do ActiveRecord::Schema.define(version: 20180427114641) do
# These are extensions that must be enabled in order to support this database # These are extensions that must be enabled in order to support this database
enable_extension "plpgsql" enable_extension "plpgsql"
...@@ -74,6 +74,8 @@ ActiveRecord::Schema.define(version: 20180419192603) do ...@@ -74,6 +74,8 @@ ActiveRecord::Schema.define(version: 20180419192603) do
t.binary "wiki_verification_checksum_sha" t.binary "wiki_verification_checksum_sha"
t.boolean "repository_checksum_mismatch", default: false, null: false t.boolean "repository_checksum_mismatch", default: false, null: false
t.boolean "wiki_checksum_mismatch", default: false, null: false t.boolean "wiki_checksum_mismatch", default: false, null: false
t.boolean "last_repository_check_failed"
t.datetime "last_repository_check_at"
end end
add_index "project_registry", ["last_repository_successful_sync_at"], name: "index_project_registry_on_last_repository_successful_sync_at", using: :btree add_index "project_registry", ["last_repository_successful_sync_at"], name: "index_project_registry_on_last_repository_successful_sync_at", using: :btree
......
require 'spec_helper'
describe EE::RepositoryCheck::BatchWorker do
include ::EE::GeoHelpers
subject(:worker) { RepositoryCheck::BatchWorker.new }
context 'Geo primary' do
set(:primary) { create(:geo_node, :primary) }
before do
stub_current_geo_node(primary)
end
it 'loads project ids from main database' do
projects = create_list(:project, 3, created_at: 1.week.ago)
expect(worker.perform).to eq(projects.map(&:id))
end
end
context 'Geo secondary' do
set(:secondary) { create(:geo_node) }
before do
stub_current_geo_node(secondary)
end
it 'loads project ids from tracking database' do
project_registries = create_list(:geo_project_registry, 3, :synced)
expect(worker.perform).to eq(project_registries.map(&:project_id))
end
it 'loads project ids that were checked more than a month ago from tracking database' do
project_registries = create_list(:geo_project_registry, 3, :synced,
last_repository_check_failed: false,
last_repository_check_at: 42.days.ago)
expect(worker.perform).to eq(project_registries.map(&:project_id))
end
end
end
require 'spec_helper'
describe EE::RepositoryCheck::SingleRepositoryWorker do
include ::EE::GeoHelpers
set(:project) { create(:project) }
subject(:worker) { RepositoryCheck::SingleRepositoryWorker.new }
context 'Geo primary' do
set(:primary) { create(:geo_node, :primary) }
before do
stub_current_geo_node(primary)
end
it 'saves results to main database' do
expect do
worker.perform(project.id)
end.to change { project.reload.last_repository_check_at }
expect(project.last_repository_check_failed).to be_falsy
end
end
context 'Geo secondary' do
set(:project_registry) { create(:geo_project_registry, project: project) }
set(:secondary) { create(:geo_node) }
before do
stub_current_geo_node(secondary)
end
it 'saves results to Geo registry' do
expect do
worker.perform(project.id)
end.to change { project_registry.reload.last_repository_check_at }
expect(project_registry.last_repository_check_failed).to be_falsy
end
it 'creates Geo registry when not yet exists' do
project_registry.destroy!
worker.perform(project.id)
expect(Geo::ProjectRegistry.find_by!(project: project.id).last_repository_check_failed).to be_falsy
end
end
end
require 'spec_helper'
describe AdminEmailWorker do
subject(:worker) { described_class.new }
describe '.perform' do
it 'does not attempt to send repository check mail when they are disabled' do
stub_application_setting(repository_checks_enabled: false)
expect(worker).not_to receive(:send_repository_check_mail)
worker.perform
end
context 'repository_checks enabled' do
before do
stub_application_setting(repository_checks_enabled: true)
end
it 'checks if repository check mail should be sent' do
expect(worker).to receive(:send_repository_check_mail)
worker.perform
end
it 'does not send mail when there are no failed repos' do
expect(RepositoryCheckMailer).not_to receive(:notify)
worker.perform
end
it 'send mail when there is a failed repo' do
create(:project, last_repository_check_failed: true, last_repository_check_at: Date.yesterday)
expect(RepositoryCheckMailer).to receive(:notify).and_return(spy)
worker.perform
end
end
end
end
...@@ -31,8 +31,8 @@ describe RepositoryCheck::BatchWorker do ...@@ -31,8 +31,8 @@ describe RepositoryCheck::BatchWorker do
it 'does nothing when repository checks are disabled' do it 'does nothing when repository checks are disabled' do
create(:project, created_at: 1.week.ago) create(:project, created_at: 1.week.ago)
current_settings = double('settings', repository_checks_enabled: false)
expect(subject).to receive(:current_settings) { current_settings } stub_application_setting(repository_checks_enabled: false)
expect(subject.perform).to eq(nil) expect(subject.perform).to eq(nil)
end end
......
...@@ -2,44 +2,60 @@ require 'spec_helper' ...@@ -2,44 +2,60 @@ require 'spec_helper'
require 'fileutils' require 'fileutils'
describe RepositoryCheck::SingleRepositoryWorker do describe RepositoryCheck::SingleRepositoryWorker do
subject { described_class.new } subject(:worker) { described_class.new }
it 'passes when the project has no push events' do it 'skips when the project has no push events' do
project = create(:project_empty_repo, :wiki_disabled) project = create(:project, :repository, :wiki_disabled)
project.events.destroy_all project.events.destroy_all
break_repo(project) break_project(project)
subject.perform(project.id) expect(worker).not_to receive(:git_fsck)
worker.perform(project.id)
expect(project.reload.last_repository_check_failed).to eq(false) expect(project.reload.last_repository_check_failed).to eq(false)
end end
it 'fails when the project has push events and a broken repository' do it 'fails when the project has push events and a broken repository' do
project = create(:project_empty_repo) project = create(:project, :repository)
create_push_event(project) create_push_event(project)
break_repo(project) break_project(project)
subject.perform(project.id) worker.perform(project.id)
expect(project.reload.last_repository_check_failed).to eq(true) expect(project.reload.last_repository_check_failed).to eq(true)
end end
it 'succeeds when the project repo is valid' do
project = create(:project, :repository, :wiki_disabled)
create_push_event(project)
expect(worker).to receive(:git_fsck).and_call_original
expect do
worker.perform(project.id)
end.to change { project.reload.last_repository_check_at }
expect(project.reload.last_repository_check_failed).to eq(false)
end
it 'fails if the wiki repository is broken' do it 'fails if the wiki repository is broken' do
project = create(:project_empty_repo, :wiki_enabled) project = create(:project, :repository, :wiki_enabled)
project.create_wiki project.create_wiki
create_push_event(project)
# Test sanity: everything should be fine before the wiki repo is broken # Test sanity: everything should be fine before the wiki repo is broken
subject.perform(project.id) worker.perform(project.id)
expect(project.reload.last_repository_check_failed).to eq(false) expect(project.reload.last_repository_check_failed).to eq(false)
break_wiki(project) break_wiki(project)
subject.perform(project.id) worker.perform(project.id)
expect(project.reload.last_repository_check_failed).to eq(true) expect(project.reload.last_repository_check_failed).to eq(true)
end end
it 'skips wikis when disabled' do it 'skips wikis when disabled' do
project = create(:project_empty_repo, :wiki_disabled) project = create(:project, :wiki_disabled)
# Make sure the test would fail if the wiki repo was checked # Make sure the test would fail if the wiki repo was checked
break_wiki(project) break_wiki(project)
...@@ -49,8 +65,8 @@ describe RepositoryCheck::SingleRepositoryWorker do ...@@ -49,8 +65,8 @@ describe RepositoryCheck::SingleRepositoryWorker do
end end
it 'creates missing wikis' do it 'creates missing wikis' do
project = create(:project_empty_repo, :wiki_enabled) project = create(:project, :wiki_enabled)
FileUtils.rm_rf(wiki_path(project)) Gitlab::Shell.new.rm_directory(project.repository_storage, project.wiki.path)
subject.perform(project.id) subject.perform(project.id)
...@@ -58,34 +74,39 @@ describe RepositoryCheck::SingleRepositoryWorker do ...@@ -58,34 +74,39 @@ describe RepositoryCheck::SingleRepositoryWorker do
end end
it 'does not create a wiki if the main repo does not exist at all' do it 'does not create a wiki if the main repo does not exist at all' do
project = create(:project_empty_repo) project = create(:project, :repository)
create_push_event(project) Gitlab::Shell.new.rm_directory(project.repository_storage, project.path)
FileUtils.rm_rf(project.repository.path_to_repo) Gitlab::Shell.new.rm_directory(project.repository_storage, project.wiki.path)
FileUtils.rm_rf(wiki_path(project))
subject.perform(project.id) subject.perform(project.id)
expect(File.exist?(wiki_path(project))).to eq(false) expect(Gitlab::Shell.new.exists?(project.repository_storage, project.wiki.path)).to eq(false)
end end
def break_wiki(project) def create_push_event(project)
objects_dir = wiki_path(project) + '/objects' project.events.create(action: Event::PUSHED, author_id: create(:user).id)
end
# Replace the /objects directory with a file so that the repo is def break_wiki(project)
# invalid, _and_ 'git init' cannot fix it. break_repo(wiki_path(project))
FileUtils.rm_rf(objects_dir)
FileUtils.touch(objects_dir) if File.directory?(wiki_path(project))
end end
def wiki_path(project) def wiki_path(project)
project.wiki.repository.path_to_repo project.wiki.repository.path_to_repo
end end
def create_push_event(project) def break_project(project)
project.events.create(action: Event::PUSHED, author_id: create(:user).id) break_repo(project.repository.path_to_repo)
end end
def break_repo(project) def break_repo(repo)
FileUtils.rm_rf(File.join(project.repository.path_to_repo, 'objects')) # Create or replace blob ffffffffffffffffffffffffffffffffffffffff with an empty file
# This will make the repo invalid, _and_ 'git init' cannot fix it.
path = File.join(repo, 'objects', 'ff')
file = File.join(path, 'ffffffffffffffffffffffffffffffffffffff')
FileUtils.mkdir_p(path)
FileUtils.rm_f(file)
FileUtils.touch(file)
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment