Commit 248debde authored by Bob Van Landuyt's avatar Bob Van Landuyt

Implement backoff for the circuitbreaker

The circuitbreaker now has 2 failure modes:

- Backing off: This will raise the `Gitlab::Git::Storage::Failing`
  exception. Access to the shard is blocked temporarily.
- Circuit broken: This will raise the
  `Gitlab::Git::Storage::CircuitBroken` exception. Access to the shard
  will be blocked until the failures are reset.
parent c1f4d7d6
......@@ -16,17 +16,16 @@ module StorageHealthHelper
def message_for_circuit_breaker(circuit_breaker)
maximum_failures = circuit_breaker.failure_count_threshold
current_failures = circuit_breaker.failure_count
permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures
translation_params = { number_of_failures: current_failures,
maximum_failures: maximum_failures,
number_of_seconds: circuit_breaker.failure_wait_time }
if permanently_broken
if circuit_breaker.circuit_broken?
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
"retry automatically. Reset storage information when the problem is "\
"resolved.") % translation_params
elsif circuit_breaker.circuit_broken?
elsif circuit_breaker.backing_off?
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"block access for %{number_of_seconds} seconds.") % translation_params
else
......
......@@ -12,6 +12,7 @@ module Gitlab
CircuitOpen = Class.new(Inaccessible)
Misconfiguration = Class.new(Inaccessible)
Failing = Class.new(Inaccessible)
REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
......
......@@ -64,12 +64,20 @@ module Gitlab
def circuit_broken?
return false if no_failures?
failure_count > failure_count_threshold
end
def backing_off?
return false if no_failures?
recent_failure = last_failure > failure_wait_time.seconds.ago
too_many_failures = failure_count > failure_count_threshold
too_many_failures = failure_count > backoff_threshold
recent_failure || too_many_failures
recent_failure && too_many_failures
end
private
def failure_info
@failure_info ||= get_failure_info
end
......@@ -94,7 +102,11 @@ module Gitlab
def check_storage_accessible!
if circuit_broken?
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time)
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time)
end
if backing_off?
raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time)
end
unless storage_available?
......@@ -131,12 +143,6 @@ module Gitlab
end
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
private
def get_failure_info
last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, :last_failure, :failure_count)
......@@ -146,6 +152,10 @@ module Gitlab
FailureInfo.new(last_failure, failure_count.to_i)
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
end
end
end
......
......@@ -18,6 +18,14 @@ module Gitlab
application_settings.circuitbreaker_storage_timeout
end
def access_retries
application_settings.circuitbreaker_access_retries
end
def backoff_threshold
application_settings.circuitbreaker_backoff_threshold
end
private
def application_settings
......
......@@ -25,6 +25,10 @@ module Gitlab
!!@error
end
def backing_off?
false
end
def last_failure
circuit_broken? ? Time.now : nil
end
......
......@@ -79,7 +79,9 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
stub_application_setting(circuitbreaker_failure_count_threshold: 0,
circuitbreaker_failure_wait_time: 1,
circuitbreaker_failure_reset_time: 2,
circuitbreaker_storage_timeout: 3)
circuitbreaker_storage_timeout: 3,
circuitbreaker_access_retries: 4,
circuitbreaker_backoff_threshold: 5)
end
describe '#failure_count_threshold' do
......@@ -105,14 +107,43 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
expect(circuit_breaker.storage_timeout).to eq(3)
end
end
describe '#access_retries' do
it 'reads the value from settings' do
expect(circuit_breaker.access_retries).to eq(4)
end
end
describe '#backoff_threshold' do
it 'reads the value from settings' do
expect(circuit_breaker.backoff_threshold).to eq(5)
end
end
end
describe '#perform' do
it 'raises an exception with retry time when the circuit is open' do
allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
it 'raises the correct exception when the circuit is open' do
set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 999)
expect { |b| circuit_breaker.perform(&b) }
.to raise_error(Gitlab::Git::Storage::CircuitOpen)
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
expect(exception.retry_after).to eq(1800)
end
end
it 'raises the correct exception when backing off' do
Timecop.freeze do
set_in_redis(:last_failure, 1.second.ago.to_f)
set_in_redis(:failure_count, 90)
expect { |b| circuit_breaker.perform(&b) }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::Failing)
expect(exception.retry_after).to eq(30)
end
end
end
it 'yields the block' do
......@@ -122,6 +153,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
it 'checks if the storage is available' do
expect(circuit_breaker).to receive(:check_storage_accessible!)
.and_call_original
circuit_breaker.perform { 'hello world' }
end
......@@ -137,201 +169,102 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
.to raise_error(Rugged::OSError)
end
context 'with the feature disabled' do
it 'returns the block without checking accessibility' do
stub_feature_flags(git_storage_circuit_breaker: false)
expect(circuit_breaker).not_to receive(:circuit_broken?)
result = circuit_breaker.perform { 'hello' }
expect(result).to eq('hello')
end
end
end
describe '#circuit_broken?' do
it 'is working when there is no last failure' do
set_in_redis(:last_failure, nil)
set_in_redis(:failure_count, 0)
expect(circuit_breaker.circuit_broken?).to be_falsey
end
it 'is broken when there was a recent failure' do
Timecop.freeze do
set_in_redis(:last_failure, 1.second.ago.to_f)
set_in_redis(:failure_count, 1)
expect(circuit_breaker.circuit_broken?).to be_truthy
end
end
it 'is broken when there are too many failures' do
set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 200)
expect(circuit_breaker.circuit_broken?).to be_truthy
end
context 'the `failure_wait_time` is set to 0' do
before do
stub_application_setting(circuitbreaker_failure_wait_time: 0)
end
it 'tracks that the storage was accessible' do
set_in_redis(:failure_count, 10)
set_in_redis(:last_failure, Time.now.to_f)
it 'is working even when there is a recent failure' do
Timecop.freeze do
set_in_redis(:last_failure, 0.seconds.ago.to_f)
set_in_redis(:failure_count, 1)
circuit_breaker.perform { '' }
expect(circuit_breaker.circuit_broken?).to be_falsey
end
end
expect(value_from_redis(:failure_count).to_i).to eq(0)
expect(value_from_redis(:last_failure)).to be_empty
expect(circuit_breaker.failure_count).to eq(0)
expect(circuit_breaker.last_failure).to be_nil
end
end
describe "storage_available?" do
context 'the storage is available' do
it 'tracks that the storage was accessible an raises the error' do
expect(circuit_breaker).to receive(:track_storage_accessible)
circuit_breaker.storage_available?
end
it 'only accessibility check once' do
expect(Gitlab::Git::Storage::ForkedStorageCheck)
.to receive(:storage_available?).once.and_call_original
it 'only performs the check once' do
expect(Gitlab::Git::Storage::ForkedStorageCheck)
.to receive(:storage_available?).once.and_call_original
2.times { circuit_breaker.storage_available? }
end
2.times { circuit_breaker.perform { '' } }
end
context 'storage is not available' do
let(:storage_name) { 'broken' }
it 'tracks that the storage was inaccessible' do
expect(circuit_breaker).to receive(:track_storage_inaccessible)
context 'with the feature disabled' do
it 'returns the block without checking accessibility' do
stub_feature_flags(git_storage_circuit_breaker: false)
circuit_breaker.storage_available?
end
end
end
expect(circuit_breaker).not_to receive(:circuit_broken?)
describe '#check_storage_accessible!' do
it 'raises an exception with retry time when the circuit is open' do
allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
result = circuit_breaker.perform { 'hello' }
expect { circuit_breaker.check_storage_accessible! }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
expect(exception.retry_after).to eq(30)
expect(result).to eq('hello')
end
end
context 'the storage is not available' do
let(:storage_name) { 'broken' }
it 'raises an error' do
it 'raises the correct exception' do
expect(circuit_breaker).to receive(:track_storage_inaccessible)
expect { circuit_breaker.check_storage_accessible! }
expect { circuit_breaker.perform { '' } }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible)
expect(exception.retry_after).to eq(30)
end
end
end
end
describe '#track_storage_inaccessible' do
around do |example|
Timecop.freeze { example.run }
end
it 'records the failure time in redis' do
circuit_breaker.track_storage_inaccessible
failure_time = value_from_redis(:last_failure)
expect(Time.at(failure_time.to_i)).to be_within(1.second).of(Time.now)
end
it 'sets the failure time on the breaker without reloading' do
circuit_breaker.track_storage_inaccessible
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.last_failure).to eq(Time.now)
end
it 'increments the failure count in redis' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_inaccessible
expect(value_from_redis(:failure_count).to_i).to be(11)
end
it 'increments the failure count on the breaker without reloading' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_inaccessible
it 'tracks that the storage was inaccessible' do
Timecop.freeze do
expect { circuit_breaker.perform { '' } }.to raise_error(Gitlab::Git::Storage::Inaccessible)
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.failure_count).to eq(11)
expect(value_from_redis(:failure_count).to_i).to eq(1)
expect(value_from_redis(:last_failure)).not_to be_empty
expect(circuit_breaker.failure_count).to eq(1)
expect(circuit_breaker.last_failure).to be_within(1.second).of(Time.now)
end
end
end
end
describe '#track_storage_accessible' do
it 'sets the failure count to zero in redis' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_accessible
expect(value_from_redis(:failure_count).to_i).to be(0)
end
it 'sets the failure count to zero on the breaker without reloading' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_accessible
describe '#circuit_broken?' do
it 'is working when there is no last failure' do
set_in_redis(:last_failure, nil)
set_in_redis(:failure_count, 0)
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.failure_count).to eq(0)
expect(circuit_breaker.circuit_broken?).to be_falsey
end
it 'removes the last failure time from redis' do
set_in_redis(:last_failure, Time.now.to_i)
circuit_breaker.track_storage_accessible
it 'is broken when there are too many failures' do
set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 200)
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.last_failure).to be_nil
expect(circuit_breaker.circuit_broken?).to be_truthy
end
end
it 'removes the last failure time from the breaker without reloading' do
set_in_redis(:last_failure, Time.now.to_i)
circuit_breaker.track_storage_accessible
describe '#backing_off?' do
it 'is true when there was a recent failure' do
Timecop.freeze do
set_in_redis(:last_failure, 1.second.ago.to_f)
set_in_redis(:failure_count, 90)
expect(value_from_redis(:last_failure)).to be_empty
expect(circuit_breaker.backing_off?).to be_truthy
end
end
it 'wont connect to redis when there are no failures' do
expect(Gitlab::Git::Storage.redis).to receive(:with).once
.and_call_original
expect(circuit_breaker).to receive(:track_storage_accessible)
.and_call_original
circuit_breaker.track_storage_accessible
end
end
context 'the `failure_wait_time` is set to 0' do
before do
stub_application_setting(circuitbreaker_failure_wait_time: 0)
end
describe '#no_failures?' do
it 'is false when a failure was tracked' do
set_in_redis(:last_failure, Time.now.to_i)
set_in_redis(:failure_count, 1)
it 'is working even when there are failures' do
Timecop.freeze do
set_in_redis(:last_failure, 0.seconds.ago.to_f)
set_in_redis(:failure_count, 90)
expect(circuit_breaker.no_failures?).to be_falsey
expect(circuit_breaker.backing_off?).to be_falsey
end
end
end
end
......@@ -351,10 +284,4 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
expect(circuit_breaker.failure_count).to eq(7)
end
end
describe '#cache_key' do
it 'includes storage and host' do
expect(circuit_breaker.cache_key).to eq(cache_key)
end
end
end
......@@ -65,17 +65,6 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
ours = described_class.public_instance_methods
theirs = Gitlab::Git::Storage::CircuitBreaker.public_instance_methods
# These methods are not part of the public API, but are public to allow the
# CircuitBreaker specs to operate. They should be made private over time.
exceptions = %i[
cache_key
check_storage_accessible!
no_failures?
storage_available?
track_storage_accessible
track_storage_inaccessible
]
expect(theirs - ours).to contain_exactly(*exceptions)
expect(theirs - ours).to be_empty
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment