Commit 430e7671 authored by Bob Van Landuyt's avatar Bob Van Landuyt

Implement backoff for the circuitbreaker

The circuitbreaker now has 2 failure modes:

- Backing off: This will raise the `Gitlab::Git::Storage::Failing`
  exception. Access to the shard is blocked temporarily.
- Circuit broken: This will raise the
  `Gitlab::Git::Storage::CircuitBroken` exception. Access to the shard
  will be blocked until the failures are reset.
parent 1881d4f8
...@@ -16,17 +16,16 @@ module StorageHealthHelper ...@@ -16,17 +16,16 @@ module StorageHealthHelper
def message_for_circuit_breaker(circuit_breaker) def message_for_circuit_breaker(circuit_breaker)
maximum_failures = circuit_breaker.failure_count_threshold maximum_failures = circuit_breaker.failure_count_threshold
current_failures = circuit_breaker.failure_count current_failures = circuit_breaker.failure_count
permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures
translation_params = { number_of_failures: current_failures, translation_params = { number_of_failures: current_failures,
maximum_failures: maximum_failures, maximum_failures: maximum_failures,
number_of_seconds: circuit_breaker.failure_wait_time } number_of_seconds: circuit_breaker.failure_wait_time }
if permanently_broken if circuit_breaker.circuit_broken?
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\ s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
"retry automatically. Reset storage information when the problem is "\ "retry automatically. Reset storage information when the problem is "\
"resolved.") % translation_params "resolved.") % translation_params
elsif circuit_breaker.circuit_broken? elsif circuit_breaker.backing_off?
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\ _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"block access for %{number_of_seconds} seconds.") % translation_params "block access for %{number_of_seconds} seconds.") % translation_params
else else
......
...@@ -12,6 +12,7 @@ module Gitlab ...@@ -12,6 +12,7 @@ module Gitlab
CircuitOpen = Class.new(Inaccessible) CircuitOpen = Class.new(Inaccessible)
Misconfiguration = Class.new(Inaccessible) Misconfiguration = Class.new(Inaccessible)
Failing = Class.new(Inaccessible)
REDIS_KEY_PREFIX = 'storage_accessible:'.freeze REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
......
...@@ -64,12 +64,20 @@ module Gitlab ...@@ -64,12 +64,20 @@ module Gitlab
def circuit_broken? def circuit_broken?
return false if no_failures? return false if no_failures?
failure_count > failure_count_threshold
end
def backing_off?
return false if no_failures?
recent_failure = last_failure > failure_wait_time.seconds.ago recent_failure = last_failure > failure_wait_time.seconds.ago
too_many_failures = failure_count > failure_count_threshold too_many_failures = failure_count > backoff_threshold
recent_failure || too_many_failures recent_failure && too_many_failures
end end
private
def failure_info def failure_info
@failure_info ||= get_failure_info @failure_info ||= get_failure_info
end end
...@@ -94,7 +102,11 @@ module Gitlab ...@@ -94,7 +102,11 @@ module Gitlab
def check_storage_accessible! def check_storage_accessible!
if circuit_broken? if circuit_broken?
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time) raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time)
end
if backing_off?
raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time)
end end
unless storage_available? unless storage_available?
...@@ -131,12 +143,6 @@ module Gitlab ...@@ -131,12 +143,6 @@ module Gitlab
end end
end end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
private
def get_failure_info def get_failure_info
last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis| last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, :last_failure, :failure_count) redis.hmget(cache_key, :last_failure, :failure_count)
...@@ -146,6 +152,10 @@ module Gitlab ...@@ -146,6 +152,10 @@ module Gitlab
FailureInfo.new(last_failure, failure_count.to_i) FailureInfo.new(last_failure, failure_count.to_i)
end end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
end end
end end
end end
......
...@@ -18,6 +18,14 @@ module Gitlab ...@@ -18,6 +18,14 @@ module Gitlab
application_settings.circuitbreaker_storage_timeout application_settings.circuitbreaker_storage_timeout
end end
def access_retries
application_settings.circuitbreaker_access_retries
end
def backoff_threshold
application_settings.circuitbreaker_backoff_threshold
end
private private
def application_settings def application_settings
......
...@@ -25,6 +25,10 @@ module Gitlab ...@@ -25,6 +25,10 @@ module Gitlab
!!@error !!@error
end end
def backing_off?
false
end
def last_failure def last_failure
circuit_broken? ? Time.now : nil circuit_broken? ? Time.now : nil
end end
......
...@@ -79,7 +79,9 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -79,7 +79,9 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
stub_application_setting(circuitbreaker_failure_count_threshold: 0, stub_application_setting(circuitbreaker_failure_count_threshold: 0,
circuitbreaker_failure_wait_time: 1, circuitbreaker_failure_wait_time: 1,
circuitbreaker_failure_reset_time: 2, circuitbreaker_failure_reset_time: 2,
circuitbreaker_storage_timeout: 3) circuitbreaker_storage_timeout: 3,
circuitbreaker_access_retries: 4,
circuitbreaker_backoff_threshold: 5)
end end
describe '#failure_count_threshold' do describe '#failure_count_threshold' do
...@@ -105,14 +107,43 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -105,14 +107,43 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
expect(circuit_breaker.storage_timeout).to eq(3) expect(circuit_breaker.storage_timeout).to eq(3)
end end
end end
describe '#access_retries' do
it 'reads the value from settings' do
expect(circuit_breaker.access_retries).to eq(4)
end
end
describe '#backoff_threshold' do
it 'reads the value from settings' do
expect(circuit_breaker.backoff_threshold).to eq(5)
end
end
end end
describe '#perform' do describe '#perform' do
it 'raises an exception with retry time when the circuit is open' do it 'raises the correct exception when the circuit is open' do
allow(circuit_breaker).to receive(:circuit_broken?).and_return(true) set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 999)
expect { |b| circuit_breaker.perform(&b) } expect { |b| circuit_breaker.perform(&b) }
.to raise_error(Gitlab::Git::Storage::CircuitOpen) .to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
expect(exception.retry_after).to eq(1800)
end
end
it 'raises the correct exception when backing off' do
Timecop.freeze do
set_in_redis(:last_failure, 1.second.ago.to_f)
set_in_redis(:failure_count, 90)
expect { |b| circuit_breaker.perform(&b) }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::Failing)
expect(exception.retry_after).to eq(30)
end
end
end end
it 'yields the block' do it 'yields the block' do
...@@ -122,6 +153,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -122,6 +153,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
it 'checks if the storage is available' do it 'checks if the storage is available' do
expect(circuit_breaker).to receive(:check_storage_accessible!) expect(circuit_breaker).to receive(:check_storage_accessible!)
.and_call_original
circuit_breaker.perform { 'hello world' } circuit_breaker.perform { 'hello world' }
end end
...@@ -137,201 +169,102 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -137,201 +169,102 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
.to raise_error(Rugged::OSError) .to raise_error(Rugged::OSError)
end end
context 'with the feature disabled' do it 'tracks that the storage was accessible' do
it 'returns the block without checking accessibility' do set_in_redis(:failure_count, 10)
stub_feature_flags(git_storage_circuit_breaker: false) set_in_redis(:last_failure, Time.now.to_f)
expect(circuit_breaker).not_to receive(:circuit_broken?)
result = circuit_breaker.perform { 'hello' }
expect(result).to eq('hello')
end
end
end
describe '#circuit_broken?' do
it 'is working when there is no last failure' do
set_in_redis(:last_failure, nil)
set_in_redis(:failure_count, 0)
expect(circuit_breaker.circuit_broken?).to be_falsey
end
it 'is broken when there was a recent failure' do
Timecop.freeze do
set_in_redis(:last_failure, 1.second.ago.to_f)
set_in_redis(:failure_count, 1)
expect(circuit_breaker.circuit_broken?).to be_truthy
end
end
it 'is broken when there are too many failures' do
set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 200)
expect(circuit_breaker.circuit_broken?).to be_truthy
end
context 'the `failure_wait_time` is set to 0' do
before do
stub_application_setting(circuitbreaker_failure_wait_time: 0)
end
it 'is working even when there is a recent failure' do
Timecop.freeze do
set_in_redis(:last_failure, 0.seconds.ago.to_f)
set_in_redis(:failure_count, 1)
expect(circuit_breaker.circuit_broken?).to be_falsey
end
end
end
end
describe "storage_available?" do circuit_breaker.perform { '' }
context 'the storage is available' do
it 'tracks that the storage was accessible an raises the error' do
expect(circuit_breaker).to receive(:track_storage_accessible)
circuit_breaker.storage_available? expect(value_from_redis(:failure_count).to_i).to eq(0)
expect(value_from_redis(:last_failure)).to be_empty
expect(circuit_breaker.failure_count).to eq(0)
expect(circuit_breaker.last_failure).to be_nil
end end
it 'only performs the check once' do it 'only accessibility check once' do
expect(Gitlab::Git::Storage::ForkedStorageCheck) expect(Gitlab::Git::Storage::ForkedStorageCheck)
.to receive(:storage_available?).once.and_call_original .to receive(:storage_available?).once.and_call_original
2.times { circuit_breaker.storage_available? } 2.times { circuit_breaker.perform { '' } }
end
end end
context 'storage is not available' do context 'with the feature disabled' do
let(:storage_name) { 'broken' } it 'returns the block without checking accessibility' do
stub_feature_flags(git_storage_circuit_breaker: false)
it 'tracks that the storage was inaccessible' do
expect(circuit_breaker).to receive(:track_storage_inaccessible)
circuit_breaker.storage_available? expect(circuit_breaker).not_to receive(:circuit_broken?)
end
end
end
describe '#check_storage_accessible!' do result = circuit_breaker.perform { 'hello' }
it 'raises an exception with retry time when the circuit is open' do
allow(circuit_breaker).to receive(:circuit_broken?).and_return(true)
expect { circuit_breaker.check_storage_accessible! } expect(result).to eq('hello')
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::CircuitOpen)
expect(exception.retry_after).to eq(30)
end end
end end
context 'the storage is not available' do context 'the storage is not available' do
let(:storage_name) { 'broken' } let(:storage_name) { 'broken' }
it 'raises an error' do it 'raises the correct exception' do
expect(circuit_breaker).to receive(:track_storage_inaccessible) expect(circuit_breaker).to receive(:track_storage_inaccessible)
expect { circuit_breaker.check_storage_accessible! } expect { circuit_breaker.perform { '' } }
.to raise_error do |exception| .to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible) expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible)
expect(exception.retry_after).to eq(30) expect(exception.retry_after).to eq(30)
end end
end end
end
end
describe '#track_storage_inaccessible' do
around do |example|
Timecop.freeze { example.run }
end
it 'records the failure time in redis' do
circuit_breaker.track_storage_inaccessible
failure_time = value_from_redis(:last_failure)
expect(Time.at(failure_time.to_i)).to be_within(1.second).of(Time.now)
end
it 'sets the failure time on the breaker without reloading' do it 'tracks that the storage was inaccessible' do
circuit_breaker.track_storage_inaccessible Timecop.freeze do
expect { circuit_breaker.perform { '' } }.to raise_error(Gitlab::Git::Storage::Inaccessible)
expect(circuit_breaker).not_to receive(:get_failure_info) expect(value_from_redis(:failure_count).to_i).to eq(1)
expect(circuit_breaker.last_failure).to eq(Time.now) expect(value_from_redis(:last_failure)).not_to be_empty
expect(circuit_breaker.failure_count).to eq(1)
expect(circuit_breaker.last_failure).to be_within(1.second).of(Time.now)
end end
it 'increments the failure count in redis' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_inaccessible
expect(value_from_redis(:failure_count).to_i).to be(11)
end end
it 'increments the failure count on the breaker without reloading' do
set_in_redis(:failure_count, 10)
circuit_breaker.track_storage_inaccessible
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.failure_count).to eq(11)
end end
end end
describe '#track_storage_accessible' do describe '#circuit_broken?' do
it 'sets the failure count to zero in redis' do it 'is working when there is no last failure' do
set_in_redis(:failure_count, 10) set_in_redis(:last_failure, nil)
set_in_redis(:failure_count, 0)
circuit_breaker.track_storage_accessible
expect(value_from_redis(:failure_count).to_i).to be(0) expect(circuit_breaker.circuit_broken?).to be_falsey
end end
it 'sets the failure count to zero on the breaker without reloading' do it 'is broken when there are too many failures' do
set_in_redis(:failure_count, 10) set_in_redis(:last_failure, 1.day.ago.to_f)
set_in_redis(:failure_count, 200)
circuit_breaker.track_storage_accessible
expect(circuit_breaker).not_to receive(:get_failure_info) expect(circuit_breaker.circuit_broken?).to be_truthy
expect(circuit_breaker.failure_count).to eq(0)
end end
it 'removes the last failure time from redis' do
set_in_redis(:last_failure, Time.now.to_i)
circuit_breaker.track_storage_accessible
expect(circuit_breaker).not_to receive(:get_failure_info)
expect(circuit_breaker.last_failure).to be_nil
end end
it 'removes the last failure time from the breaker without reloading' do describe '#backing_off?' do
set_in_redis(:last_failure, Time.now.to_i) it 'is true when there was a recent failure' do
Timecop.freeze do
set_in_redis(:last_failure, 1.second.ago.to_f)
set_in_redis(:failure_count, 90)
circuit_breaker.track_storage_accessible expect(circuit_breaker.backing_off?).to be_truthy
end
end
expect(value_from_redis(:last_failure)).to be_empty context 'the `failure_wait_time` is set to 0' do
before do
stub_application_setting(circuitbreaker_failure_wait_time: 0)
end end
it 'wont connect to redis when there are no failures' do it 'is working even when there are failures' do
expect(Gitlab::Git::Storage.redis).to receive(:with).once Timecop.freeze do
.and_call_original set_in_redis(:last_failure, 0.seconds.ago.to_f)
expect(circuit_breaker).to receive(:track_storage_accessible) set_in_redis(:failure_count, 90)
.and_call_original
circuit_breaker.track_storage_accessible expect(circuit_breaker.backing_off?).to be_falsey
end end
end end
describe '#no_failures?' do
it 'is false when a failure was tracked' do
set_in_redis(:last_failure, Time.now.to_i)
set_in_redis(:failure_count, 1)
expect(circuit_breaker.no_failures?).to be_falsey
end end
end end
...@@ -351,10 +284,4 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -351,10 +284,4 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
expect(circuit_breaker.failure_count).to eq(7) expect(circuit_breaker.failure_count).to eq(7)
end end
end end
describe '#cache_key' do
it 'includes storage and host' do
expect(circuit_breaker.cache_key).to eq(cache_key)
end
end
end end
...@@ -65,17 +65,6 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do ...@@ -65,17 +65,6 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
ours = described_class.public_instance_methods ours = described_class.public_instance_methods
theirs = Gitlab::Git::Storage::CircuitBreaker.public_instance_methods theirs = Gitlab::Git::Storage::CircuitBreaker.public_instance_methods
# These methods are not part of the public API, but are public to allow the expect(theirs - ours).to be_empty
# CircuitBreaker specs to operate. They should be made private over time.
exceptions = %i[
cache_key
check_storage_accessible!
no_failures?
storage_available?
track_storage_accessible
track_storage_inaccessible
]
expect(theirs - ours).to contain_exactly(*exceptions)
end end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment