Commit 40c38130 authored by Stan Hu's avatar Stan Hu

Merge branch '6698-geo-retry-checksum-calculation-for-failures' into 'master'

Resolve "Geo - Retry checksum calculation for failures on the primary node"

Closes #6698

See merge request gitlab-org/gitlab-ee!6295
parents 953a5d32 fbd1bfe7
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20180702114215) do
ActiveRecord::Schema.define(version: 20180702181530) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -2048,6 +2048,10 @@ ActiveRecord::Schema.define(version: 20180702114215) do
t.binary "wiki_verification_checksum"
t.string "last_repository_verification_failure"
t.string "last_wiki_verification_failure"
t.datetime_with_timezone "repository_retry_at"
t.datetime_with_timezone "wiki_retry_at"
t.integer "repository_retry_count"
t.integer "wiki_retry_count"
end
add_index "project_repository_states", ["last_repository_verification_failure"], name: "idx_repository_states_on_repository_failure_partial", where: "(last_repository_verification_failure IS NOT NULL)", using: :btree
......
......@@ -4,6 +4,24 @@ module Geo
@shard_name = shard_name
end
def find_failed_repositories(batch_size:)
query = build_query_to_find_failed_projects(type: :repository, batch_size: batch_size)
cte = Gitlab::SQL::CTE.new(:failed_repositories, query)
Project.with(cte.to_arel)
.from(cte.alias_to(projects_table))
.order("projects.repository_retry_at ASC")
end
def find_failed_wikis(batch_size:)
query = build_query_to_find_failed_projects(type: :wiki, batch_size: batch_size)
cte = Gitlab::SQL::CTE.new(:failed_wikis, query)
Project.with(cte.to_arel)
.from(cte.alias_to(projects_table))
.order("projects.wiki_retry_at ASC")
end
def find_outdated_projects(batch_size:)
query = build_query_to_find_outdated_projects(batch_size: batch_size)
cte = Gitlab::SQL::CTE.new(:outdated_projects, query)
......@@ -45,6 +63,18 @@ module Geo
attr_reader :shard_name
def build_query_to_find_failed_projects(type:, batch_size:)
query =
projects_table
.join(repository_state_table).on(project_id_matcher)
.project(projects_table[:id], repository_state_table["#{type}_retry_at"])
.where(repository_state_table["last_#{type}_verification_failure"].not_eq(nil))
.take(batch_size)
query = apply_shard_restriction(query) if shard_name.present?
query
end
def build_query_to_find_outdated_projects(batch_size:)
query =
projects_table
......
......@@ -36,18 +36,35 @@ module Geo
def reset_repository_checksum!
return if repository_state.nil?
repository_state.update!("#{repository_checksum_column}" => nil, "#{repository_failure_column}" => nil)
repository_state.update!(
"#{repository_checksum_column}" => nil,
"#{repository_failure_column}" => nil,
"#{repository_retry_at_column}" => nil,
"#{repository_retry_count_column}" => nil
)
rescue => e
log_error('Cannot reset repository checksum', e)
raise RepositoryUpdateError, "Cannot reset repository checksum: #{e}"
end
def repository_checksum_column
"#{Geo::RepositoryUpdatedEvent.sources.key(source)}_verification_checksum"
"#{repository_type}_verification_checksum"
end
def repository_failure_column
"last_#{Geo::RepositoryUpdatedEvent.sources.key(source)}_verification_failure"
"last_#{repository_type}_verification_failure"
end
def repository_retry_at_column
"#{repository_type}_retry_at"
end
def repository_retry_count_column
"#{repository_type}_retry_count"
end
def repository_type
@repository_type ||= Geo::RepositoryUpdatedEvent.sources.key(source)
end
end
end
module Geo
class RepositoryVerificationPrimaryService
include Delay
include Gitlab::Geo::ProjectLogHelpers
def initialize(project)
......@@ -33,12 +34,29 @@ module Geo
end
def update_repository_state!(type, checksum: nil, failure: nil)
retry_at, retry_count =
if failure.present?
retry_count = repository_state.public_send("#{type}_retry_count").to_i + 1 # rubocop:disable GitlabSecurity/PublicSend
[next_retry_time(retry_count), retry_count]
end
repository_state.update!(
"#{type}_verification_checksum" => checksum,
"last_#{type}_verification_failure" => failure
"last_#{type}_verification_failure" => failure,
"#{type}_retry_at" => retry_at,
"#{type}_retry_count" => retry_count
)
end
# To prevent the retry time from storing invalid dates in the database,
# cap the max time to a week plus some random jitter value.
def next_retry_time(retry_count)
proposed_time = Time.now + delay(retry_count).seconds
max_future_time = Time.now + 7.days + delay(1).seconds
[proposed_time, max_future_time].min
end
def repository_state
@repository_state ||= project.repository_state || project.build_repository_state
end
......
......@@ -42,12 +42,13 @@ module Geo
def load_pending_resources
resources = find_unverified_project_ids(batch_size: db_retrieve_batch_size)
remaining_capacity = db_retrieve_batch_size - resources.size
return resources if remaining_capacity.zero?
if remaining_capacity.zero?
resources
else
resources + find_outdated_project_ids(batch_size: remaining_capacity)
end
resources += find_outdated_project_ids(batch_size: remaining_capacity)
remaining_capacity = db_retrieve_batch_size - resources.size
return resources if remaining_capacity.zero?
resources + find_failed_project_ids(batch_size: remaining_capacity)
end
def find_unverified_project_ids(batch_size:)
......@@ -57,6 +58,21 @@ module Geo
def find_outdated_project_ids(batch_size:)
finder.find_outdated_projects(batch_size: batch_size).pluck(:id)
end
def find_failed_project_ids(batch_size:)
repositories_ids = find_failed_repositories_ids(batch_size: batch_size)
wiki_ids = find_failed_wiki_ids(batch_size: batch_size)
take_batch(repositories_ids, wiki_ids, batch_size: batch_size)
end
def find_failed_repositories_ids(batch_size:)
finder.find_failed_repositories(batch_size: batch_size).pluck(:id)
end
def find_failed_wiki_ids(batch_size:)
finder.find_failed_wikis(batch_size: batch_size).pluck(:id)
end
end
end
end
......
---
title: Geo - Retry checksum calculation for failures on the primary node
merge_request: 6295
author:
type: changed
class AddRetryFieldsToProjectRepositoryStates < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
def change
add_column :project_repository_states, :repository_retry_at, :datetime_with_timezone
add_column :project_repository_states, :wiki_retry_at, :datetime_with_timezone
add_column :project_repository_states, :repository_retry_count, :integer
add_column :project_repository_states, :wiki_retry_count, :integer
end
end
......@@ -3,6 +3,92 @@ require 'spec_helper'
describe Geo::RepositoryVerificationFinder, :postgresql do
set(:project) { create(:project) }
describe '#find_failed_repositories' do
it 'returns projects where repository verification failed' do
create(:repository_state, :repository_failed, :wiki_verified, project: project)
expect(subject.find_failed_repositories(batch_size: 10))
.to match_array(project)
end
it 'does not return projects where repository verification is outdated' do
create(:repository_state, :repository_outdated, project: project)
expect(subject.find_failed_repositories(batch_size: 10)).to be_empty
end
it 'does not return projects where repository verification is pending' do
create(:repository_state, :wiki_verified, project: project)
expect(subject.find_failed_repositories(batch_size: 10)).to be_empty
end
it 'returns projects ordered by next retry time' do
next_project = create(:project)
create(:repository_state, :repository_failed, repository_retry_at: 1.hour.from_now, project: project)
create(:repository_state, :repository_failed, repository_retry_at: 30.minutes.from_now, project: next_project)
expect(subject.find_failed_repositories(batch_size: 10)).to eq [next_project, project]
end
context 'with shard restriction' do
subject { described_class.new(shard_name: project.repository_storage) }
it 'does not return projects on other shards' do
project_other_shard = create(:project)
project_other_shard.update_column(:repository_storage, 'other')
create(:repository_state, :repository_failed, project: project)
create(:repository_state, :repository_failed, project: project_other_shard)
expect(subject.find_failed_repositories(batch_size: 10))
.to match_array(project)
end
end
end
describe '#find_failed_wikis' do
it 'returns projects where wiki verification failed' do
create(:repository_state, :repository_verified, :wiki_failed, project: project)
expect(subject.find_failed_wikis(batch_size: 10))
.to match_array(project)
end
it 'does not return projects where wiki verification is outdated' do
create(:repository_state, :wiki_outdated, project: project)
expect(subject.find_failed_wikis(batch_size: 10)).to be_empty
end
it 'does not return projects where wiki verification is pending' do
create(:repository_state, :repository_verified, project: project)
expect(subject.find_failed_wikis(batch_size: 10)).to be_empty
end
it 'returns projects ordered by next retry time' do
next_project = create(:project)
create(:repository_state, :wiki_failed, wiki_retry_at: 1.hour.from_now, project: project)
create(:repository_state, :wiki_failed, wiki_retry_at: 30.minutes.from_now, project: next_project)
expect(subject.find_failed_wikis(batch_size: 10)).to eq [next_project, project]
end
context 'with shard restriction' do
subject { described_class.new(shard_name: project.repository_storage) }
it 'does not return projects on other shards' do
project_other_shard = create(:project)
project_other_shard.update_column(:repository_storage, 'other')
create(:repository_state, :wiki_failed, project: project)
create(:repository_state, :wiki_failed, project: project_other_shard)
expect(subject.find_failed_wikis(batch_size: 10))
.to match_array(project)
end
end
end
describe '#find_outdated_projects' do
it 'returns projects where repository verification is outdated' do
create(:repository_state, :repository_outdated, project: project)
......
......@@ -48,13 +48,23 @@ describe Geo::RepositoryUpdatedService do
expect { subject.execute }.to change { repository_state.reload.public_send("last_#{method_prefix}_verification_failure") }.to(nil)
end
it 'resets the retry_at column' do
repository_state.update!("#{method_prefix}_retry_at" => 1.hour.from_now)
expect { subject.execute }.to change { repository_state.reload.public_send("#{method_prefix}_retry_at") }.to(nil)
end
it 'resets the retry_count column' do
repository_state.update!("#{method_prefix}_retry_count" => 1)
expect { subject.execute }.to change { repository_state.reload.public_send("#{method_prefix}_retry_count") }.to(nil)
end
it 'does not raise an error when project have never been verified' do
expect { described_class.new(create(:project)) }.not_to raise_error
end
it 'raises a Geo::RepositoryUpdatedService::RepositoryUpdateError when an error occurs' do
allow(subject.repository_state).to receive(:update!)
.with("#{method_prefix}_verification_checksum" => nil, "last_#{method_prefix}_verification_failure" => nil)
.with("#{method_prefix}_verification_checksum" => nil, "last_#{method_prefix}_verification_failure" => nil, "#{method_prefix}_retry_at" => nil, "#{method_prefix}_retry_count" => nil)
.and_raise(ActiveRecord::RecordInvalid.new(repository_state))
expect { subject.execute }.to raise_error Geo::RepositoryUpdatedService::RepositoryUpdateError, /Cannot reset repository checksum/
......
......@@ -18,7 +18,11 @@ describe Geo::RepositoryVerificationPrimaryService do
repository_verification_checksum: 'f123',
last_repository_verification_failure: nil,
wiki_verification_checksum: 'e321',
last_wiki_verification_failure: nil
last_wiki_verification_failure: nil,
repository_retry_at: nil,
repository_retry_count: nil,
wiki_retry_at: nil,
wiki_retry_count: nil
)
end
......@@ -38,7 +42,11 @@ describe Geo::RepositoryVerificationPrimaryService do
repository_verification_checksum: 'f123',
last_repository_verification_failure: nil,
wiki_verification_checksum: 'e321',
last_wiki_verification_failure: nil
last_wiki_verification_failure: nil,
repository_retry_at: nil,
repository_retry_count: nil,
wiki_retry_at: nil,
wiki_retry_count: nil
)
end
......@@ -58,7 +66,11 @@ describe Geo::RepositoryVerificationPrimaryService do
repository_verification_checksum: 'f123',
last_repository_verification_failure: nil,
wiki_verification_checksum: 'e321',
last_wiki_verification_failure: nil
last_wiki_verification_failure: nil,
repository_retry_at: nil,
repository_retry_count: nil,
wiki_retry_at: nil,
wiki_retry_count: nil
)
end
......@@ -89,7 +101,11 @@ describe Geo::RepositoryVerificationPrimaryService do
repository_verification_checksum: 'f123',
last_repository_verification_failure: nil,
wiki_verification_checksum: 'e321',
last_wiki_verification_failure: nil
last_wiki_verification_failure: nil,
repository_retry_at: nil,
repository_retry_count: nil,
wiki_retry_at: nil,
wiki_retry_count: nil
)
end
......@@ -100,7 +116,11 @@ describe Geo::RepositoryVerificationPrimaryService do
repository_verification_checksum: '0000000000000000000000000000000000000000',
last_repository_verification_failure: nil,
wiki_verification_checksum: '0000000000000000000000000000000000000000',
last_wiki_verification_failure: nil
last_wiki_verification_failure: nil,
repository_retry_at: nil,
repository_retry_count: nil,
wiki_retry_at: nil,
wiki_retry_count: nil
)
end
......@@ -114,25 +134,58 @@ describe Geo::RepositoryVerificationPrimaryService do
repository_verification_checksum: '0000000000000000000000000000000000000000',
last_repository_verification_failure: nil,
wiki_verification_checksum: '0000000000000000000000000000000000000000',
last_wiki_verification_failure: nil
last_wiki_verification_failure: nil,
repository_retry_at: nil,
repository_retry_count: nil,
wiki_retry_at: nil,
wiki_retry_count: nil
)
end
it 'keeps track of failures when calculating the repository checksum' do
stub_project_repository(project, repository)
stub_wiki_repository(project.wiki, wiki)
context 'when checksum calculation fails' do
before do
stub_project_repository(project, repository)
stub_wiki_repository(project.wiki, wiki)
allow(repository).to receive(:checksum).and_raise('Something went wrong with repository')
allow(wiki).to receive(:checksum).twice.and_raise('Something went wrong with wiki')
allow(repository).to receive(:checksum).and_raise('Something went wrong with repository')
allow(wiki).to receive(:checksum).twice.and_raise('Something went wrong with wiki')
end
subject.execute
it 'keeps track of failures' do
subject.execute
expect(project.repository_state).to have_attributes(
repository_verification_checksum: nil,
last_repository_verification_failure: 'Something went wrong with repository',
wiki_verification_checksum: nil,
last_wiki_verification_failure: 'Something went wrong with wiki'
)
expect(project.repository_state).to have_attributes(
repository_verification_checksum: nil,
last_repository_verification_failure: 'Something went wrong with repository',
wiki_verification_checksum: nil,
last_wiki_verification_failure: 'Something went wrong with wiki',
repository_retry_at: be_present,
repository_retry_count: 1,
wiki_retry_at: be_present,
wiki_retry_count: 1
)
end
it 'ensures the next retry time is capped properly' do
repository_state =
create(:repository_state,
project: project,
repository_retry_count: 30,
wiki_retry_count: 30)
subject.execute
expect(repository_state.reload).to have_attributes(
repository_verification_checksum: nil,
last_repository_verification_failure: 'Something went wrong with repository',
wiki_verification_checksum: nil,
last_wiki_verification_failure: 'Something went wrong with wiki',
repository_retry_at: be_within(100.seconds).of(Time.now + 7.days),
repository_retry_count: 31,
wiki_retry_at: be_within(100.seconds).of(Time.now + 7.days),
wiki_retry_count: 31
)
end
end
end
......
......@@ -63,6 +63,26 @@ describe Geo::RepositoryVerification::Primary::ShardWorker, :postgresql, :clean_
subject.perform(shard_name)
end
it 'performs Geo::RepositoryVerification::Primary::SingleWorker for projects where repository verification failed' do
repository_verification_failed = create(:project)
create(:repository_state, :repository_failed, :wiki_verified, project: repository_verification_failed)
expect(primary_singleworker).to receive(:perform_async).with(repository_verification_failed.id)
subject.perform(shard_name)
end
it 'performs Geo::RepositoryVerification::Primary::SingleWorker for projects where wiki verification failed' do
wiki_verification_failed = create(:project)
create(:repository_state, :repository_verified, :wiki_failed, project: wiki_verification_failed)
expect(primary_singleworker).to receive(:perform_async).with(wiki_verification_failed.id)
subject.perform(shard_name)
end
it 'does not perform Geo::RepositoryVerification::Primary::SingleWorker when shard becomes unhealthy' do
create(:project)
......@@ -133,14 +153,14 @@ describe Geo::RepositoryVerification::Primary::ShardWorker, :postgresql, :clean_
end
end
it 'handles multiple batches of projects needing verification, skipping failed repos' do
it 'handles multiple batches of projects needing verification, including failed repos' do
expect(primary_singleworker).to receive(:perform_async).with(project_repo_unverified.id).once.and_call_original
expect(primary_singleworker).to receive(:perform_async).with(project_wiki_unverified.id).once.and_call_original
expect(primary_singleworker).to receive(:perform_async).with(project_repo_verified.id).once.and_call_original
expect(primary_singleworker).to receive(:perform_async).with(project_wiki_verified.id).once.and_call_original
expect(primary_singleworker).not_to receive(:perform_async).with(project_both_failed.id)
expect(primary_singleworker).not_to receive(:perform_async).with(project_repo_failed_wiki_verified.id)
expect(primary_singleworker).not_to receive(:perform_async).with(project_repo_verified_wiki_failed.id)
expect(primary_singleworker).to receive(:perform_async).with(project_both_failed.id).once.and_call_original
expect(primary_singleworker).to receive(:perform_async).with(project_repo_failed_wiki_verified.id).once.and_call_original
expect(primary_singleworker).to receive(:perform_async).with(project_repo_verified_wiki_failed.id).once.and_call_original
8.times do
Sidekiq::Testing.inline! { subject.perform(shard_name) }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment