Commit 7fbefa3d authored by Sean McGivern's avatar Sean McGivern

Merge branch 'bvl-circuitbreaker-backoff' into 'master'

Circuitbreaker backoff and retries

Closes #37383 and #38231

See merge request gitlab-org/gitlab-ce!14933
parents cc170670 705c15d7
......@@ -120,6 +120,15 @@ module ApplicationSettingsHelper
message.html_safe
end
def circuitbreaker_access_retries_help_text
_('The number of attempts GitLab will make to access a storage.')
end
def circuitbreaker_backoff_threshold_help_text
_("The number of failures after which GitLab will start temporarily "\
"disabling access to a storage shard on a host")
end
def circuitbreaker_failure_wait_time_help_text
_("When access to a storage fails. GitLab will prevent access to the "\
"storage for the time specified here. This allows the filesystem to "\
......@@ -144,6 +153,8 @@ module ApplicationSettingsHelper
:akismet_api_key,
:akismet_enabled,
:auto_devops_enabled,
:circuitbreaker_access_retries,
:circuitbreaker_backoff_threshold,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time,
:circuitbreaker_failure_wait_time,
......
......@@ -16,17 +16,16 @@ module StorageHealthHelper
def message_for_circuit_breaker(circuit_breaker)
maximum_failures = circuit_breaker.failure_count_threshold
current_failures = circuit_breaker.failure_count
permanently_broken = circuit_breaker.circuit_broken? && current_failures >= maximum_failures
translation_params = { number_of_failures: current_failures,
maximum_failures: maximum_failures,
number_of_seconds: circuit_breaker.failure_wait_time }
if permanently_broken
if circuit_breaker.circuit_broken?
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
"retry automatically. Reset storage information when the problem is "\
"resolved.") % translation_params
elsif circuit_breaker.circuit_broken?
elsif circuit_breaker.backing_off?
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"block access for %{number_of_seconds} seconds.") % translation_params
else
......
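The helper now picks between three breaker states instead of two. A minimal sketch of the resulting precedence, assuming the predicate methods introduced in this MR:

```ruby
# Sketch of the message precedence after this change (not the real helper):
#   1. circuit_broken? -> access is cut off until storage information is reset
#   2. backing_off?    -> access is blocked temporarily, for failure_wait_time seconds
#   3. otherwise       -> only the failure count is reported
def breaker_state(circuit_breaker)
  if circuit_breaker.circuit_broken?
    :broken       # "GitLab will not retry automatically"
  elsif circuit_breaker.backing_off?
    :backing_off  # "GitLab will block access for N seconds"
  else
    :counting
  end
end
```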
......@@ -153,13 +153,25 @@ class ApplicationSetting < ActiveRecord::Base
presence: true,
numericality: { greater_than_or_equal_to: 0 }
validates :circuitbreaker_failure_count_threshold,
validates :circuitbreaker_backoff_threshold,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_wait_time,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 }
validates :circuitbreaker_access_retries,
presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 1 }
validates_each :circuitbreaker_backoff_threshold do |record, attr, value|
if value.to_i >= record.circuitbreaker_failure_count_threshold
record.errors.add(attr, _("The circuitbreaker backoff threshold should be "\
"lower than the failure count threshold"))
end
end
SUPPORTED_KEY_TYPES.each do |type|
validates :"#{type}_key_restriction", presence: true, key_restriction: { type: type }
end
......
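A quick console sketch of the new cross-field validation; the attribute names come from this MR, the concrete values are made up:

```ruby
setting = ApplicationSetting.new(
  circuitbreaker_failure_count_threshold: 10,
  circuitbreaker_backoff_threshold: 15 # not allowed: higher than the failure threshold
)

setting.valid? # => false (other presence validations may also fire on a bare instance)
setting.errors[:circuitbreaker_backoff_threshold]
# => ["The circuitbreaker backoff threshold should be lower than the failure count threshold"]
```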
......@@ -533,11 +533,23 @@
%fieldset
%legend Git Storage Circuitbreaker settings
.form-group
= f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2'
= f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control'
= f.number_field :circuitbreaker_access_retries, class: 'form-control'
.help-block
= circuitbreaker_failure_count_help_text
= circuitbreaker_access_retries_help_text
.form-group
= f.label :circuitbreaker_storage_timeout, _('Seconds to wait for a storage access attempt'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
.help-block
= circuitbreaker_storage_timeout_help_text
.form-group
= f.label :circuitbreaker_backoff_threshold, _('Number of failures before backing off'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_backoff_threshold, class: 'form-control'
.help-block
= circuitbreaker_backoff_threshold_help_text
.form-group
= f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2'
.col-sm-10
......@@ -545,17 +557,17 @@
.help-block
= circuitbreaker_failure_wait_time_help_text
.form-group
= f.label :circuitbreaker_failure_reset_time, _('Seconds before resetting failure information'), class: 'control-label col-sm-2'
= f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_failure_reset_time, class: 'form-control'
= f.number_field :circuitbreaker_failure_count_threshold, class: 'form-control'
.help-block
= circuitbreaker_failure_reset_time_help_text
= circuitbreaker_failure_count_help_text
.form-group
= f.label :circuitbreaker_storage_timeout, _('Seconds to wait for a storage access attempt'), class: 'control-label col-sm-2'
= f.label :circuitbreaker_failure_reset_time, _('Seconds before resetting failure information'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
= f.number_field :circuitbreaker_failure_reset_time, class: 'form-control'
.help-block
= circuitbreaker_storage_timeout_help_text
= circuitbreaker_failure_reset_time_help_text
%fieldset
%legend Repository Checks
......
---
title: Make the circuitbreaker more robust by adding higher thresholds and multiple
access attempts.
merge_request: 14933
author:
type: fixed
class AddNewCircuitbreakerSettingsToApplicationSettings < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
def change
add_column :application_settings,
:circuitbreaker_access_retries,
:integer,
default: 3
add_column :application_settings,
:circuitbreaker_backoff_threshold,
:integer,
default: 80
end
end
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20171012101043) do
ActiveRecord::Schema.define(version: 20171017145932) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -138,6 +138,8 @@ ActiveRecord::Schema.define(version: 20171012101043) do
t.integer "circuitbreaker_failure_wait_time", default: 30
t.integer "circuitbreaker_failure_reset_time", default: 1800
t.integer "circuitbreaker_storage_timeout", default: 30
t.integer "circuitbreaker_access_retries", default: 3
t.integer "circuitbreaker_backoff_threshold", default: 80
end
create_table "audit_events", force: :cascade do |t|
......
......@@ -109,6 +109,11 @@ This can be configured from the admin interface:
![circuitbreaker configuration](img/circuitbreaker_config.png)
**Number of access attempts**: The number of attempts GitLab will make to access a
storage when probing a shard.
**Number of failures before backing off**: The number of failures after which
GitLab will start temporarily disabling access to a storage shard on a host.
**Maximum git storage failures:** The number of failures after which GitLab will
completely prevent access to the storage. The number of failures can be reset in
......@@ -126,6 +131,15 @@ mount is reset.
**Seconds to wait for a storage access attempt:** The time in seconds GitLab will
try to access storage. After this time a timeout error will be raised.
To enable the circuitbreaker for repository storage, you can flip the feature flag from a Rails console:
```ruby
Feature.enable('git_storage_circuit_breaker')
```
Alternatively, it can be enabled by setting the `GIT_STORAGE_CIRCUIT_BREAKER` environment variable to `true`.
This approach is used when enabling the circuit breaker on a single host.
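These two switches are combined in `CircuitBreaker#enabled?` (see `lib/gitlab/git/storage/circuit_breaker.rb` below), so to verify the effective state from a Rails console you can evaluate the same expression:

```ruby
ENV['GIT_STORAGE_CIRCUIT_BREAKER'].present? || Feature.enabled?('git_storage_circuit_breaker')
```

Note that the code checks `present?`, so any non-empty value of the environment variable enables the breaker.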
When storage failures occur, they will be visible in the admin interface like this:
![failing storage](img/failing_storage.png)
......
......@@ -69,6 +69,8 @@ PUT /application/settings
| `after_sign_up_text` | string | no | Text shown to the user after signing up |
| `akismet_api_key` | string | no | API key for akismet spam protection |
| `akismet_enabled` | boolean | no | Enable or disable akismet spam protection |
| `circuitbreaker_access_retries` | integer | no | The number of attempts GitLab will make to access a storage. |
| `circuitbreaker_backoff_threshold` | integer | no | The number of failures after which GitLab will start temporarily disabling access to a storage shard on a host. |
| `circuitbreaker_failure_count_threshold` | integer | no | The number of failures after which GitLab will completely prevent access to the storage. |
| `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. |
| `circuitbreaker_failure_wait_time` | integer | no | Time in seconds GitLab will block access to a failing storage to allow it to recover. |
......
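A hedged sketch of updating the two new settings over the API in Ruby; the host and token are placeholders, and the values are the defaults from the migration in this MR:

```ruby
require 'net/http'
require 'uri'

# PUT /application/settings with the two new circuitbreaker parameters.
uri = URI('https://gitlab.example.com/api/v4/application/settings')
request = Net::HTTP::Put.new(uri)
request['PRIVATE-TOKEN'] = '<your_access_token>'
request.set_form_data(
  'circuitbreaker_access_retries' => '3',    # migration default
  'circuitbreaker_backoff_threshold' => '80' # migration default
)

Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) do |http|
  puts http.request(request).code
end
```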
......@@ -12,6 +12,7 @@ module Gitlab
CircuitOpen = Class.new(Inaccessible)
Misconfiguration = Class.new(Inaccessible)
Failing = Class.new(Inaccessible)
REDIS_KEY_PREFIX = 'storage_accessible:'.freeze
......
......@@ -54,7 +54,7 @@ module Gitlab
end
def perform
return yield unless Feature.enabled?('git_storage_circuit_breaker')
return yield unless enabled?
check_storage_accessible!
......@@ -64,10 +64,27 @@ module Gitlab
def circuit_broken?
return false if no_failures?
failure_count > failure_count_threshold
end
def backing_off?
return false if no_failures?
recent_failure = last_failure > failure_wait_time.seconds.ago
too_many_failures = failure_count > failure_count_threshold
too_many_failures = failure_count > backoff_threshold
recent_failure || too_many_failures
recent_failure && too_many_failures
end
private
# The circuitbreaker can be enabled for the entire fleet using a Feature
# flag.
#
# Enabling it for a single host can be done setting the
# `GIT_STORAGE_CIRCUIT_BREAKER` environment variable.
def enabled?
ENV['GIT_STORAGE_CIRCUIT_BREAKER'].present? || Feature.enabled?('git_storage_circuit_breaker')
end
def failure_info
......@@ -83,7 +100,7 @@ module Gitlab
return @storage_available if @storage_available
if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck
.storage_available?(storage_path, storage_timeout)
.storage_available?(storage_path, storage_timeout, access_retries)
track_storage_accessible
else
track_storage_inaccessible
......@@ -94,7 +111,11 @@ module Gitlab
def check_storage_accessible!
if circuit_broken?
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_wait_time)
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time)
end
if backing_off?
raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time)
end
unless storage_available?
......@@ -131,12 +152,6 @@ module Gitlab
end
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
private
def get_failure_info
last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, :last_failure, :failure_count)
......@@ -146,6 +161,10 @@ module Gitlab
FailureInfo.new(last_failure, failure_count.to_i)
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end
end
end
end
......
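After this change the breaker distinguishes a hard-open circuit from a temporary backoff, and `check_storage_accessible!` raises a different error for each, carrying a different retry hint (`failure_reset_time` for a broken circuit, `failure_wait_time` for a backoff). A sketch of what a caller sees, assuming a configured breaker instance:

```ruby
# Decision order in check_storage_accessible! after this MR:
#   circuit_broken? -> CircuitOpen  (no automatic retry; reset storage information)
#   backing_off?    -> Failing      (access resumes after failure_wait_time)
#   otherwise       -> probe the storage itself
begin
  circuit_breaker.perform { access_the_repository } # placeholder block
rescue Gitlab::Git::Storage::CircuitOpen
  # Broken until an admin resets storage information.
rescue Gitlab::Git::Storage::Failing
  # Backing off; retry once failure_wait_time has passed.
end
```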
......@@ -18,6 +18,14 @@ module Gitlab
application_settings.circuitbreaker_storage_timeout
end
def access_retries
application_settings.circuitbreaker_access_retries
end
def backoff_threshold
application_settings.circuitbreaker_backoff_threshold
end
private
def application_settings
......
......@@ -4,8 +4,17 @@ module Gitlab
module ForkedStorageCheck
extend self
def storage_available?(path, timeout_seconds = 5)
status = timeout_check(path, timeout_seconds)
def storage_available?(path, timeout_seconds = 5, retries = 1)
partial_timeout = timeout_seconds / retries
status = timeout_check(path, partial_timeout)
# If the status check did not succeed the first time, we retry a few
# more times to avoid one-off failures
current_attempts = 1
while current_attempts < retries && !status.success?
status = timeout_check(path, partial_timeout)
current_attempts += 1
end
status.success?
end
......
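The total timeout is now split evenly across attempts, so raising the retry count does not lengthen the worst case. A worked example with the new signature (the path is just an illustrative Omnibus default):

```ruby
# 30-second budget over 3 attempts: each timeout_check gets 30 / 3 = 10 seconds,
# so the overall worst case stays bounded by the original timeout.
Gitlab::Git::Storage::ForkedStorageCheck
  .storage_available?('/var/opt/gitlab/git-data', 30, 3)
```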
......@@ -25,6 +25,10 @@ module Gitlab
!!@error
end
def backing_off?
false
end
def last_failure
circuit_broken? ? Time.now : nil
end
......
......@@ -33,6 +33,21 @@ describe Gitlab::Git::Storage::ForkedStorageCheck, broken_storage: true, skip_da
expect(runtime).to be < 1.0
end
it 'will try the specified number of times before failing' do
allow(described_class).to receive(:check_filesystem_in_process) do
Process.spawn("sleep 10")
end
expect(Process).to receive(:spawn).with('sleep 10').twice
.and_call_original
runtime = Benchmark.realtime do
described_class.storage_available?(existing_path, 0.5, 2)
end
expect(runtime).to be < 1.0
end
describe 'when using paths with spaces' do
let(:test_dir) { Rails.root.join('tmp', 'tests', 'storage_check') }
let(:path_with_spaces) { File.join(test_dir, 'path with spaces') }
......
......@@ -65,17 +65,6 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
ours = described_class.public_instance_methods
theirs = Gitlab::Git::Storage::CircuitBreaker.public_instance_methods
# These methods are not part of the public API, but are public to allow the
# CircuitBreaker specs to operate. They should be made private over time.
exceptions = %i[
cache_key
check_storage_accessible!
no_failures?
storage_available?
track_storage_accessible
track_storage_inaccessible
]
expect(theirs - ours).to contain_exactly(*exceptions)
expect(theirs - ours).to be_empty
end
end
......@@ -115,7 +115,8 @@ describe ApplicationSetting do
end
context 'circuitbreaker settings' do
[:circuitbreaker_failure_count_threshold,
[:circuitbreaker_backoff_threshold,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_wait_time,
:circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout].each do |field|
......@@ -125,6 +126,16 @@ describe ApplicationSetting do
.is_greater_than_or_equal_to(0)
end
end
it 'requires the `backoff_threshold` to be lower than the `failure_count_threshold`' do
setting.circuitbreaker_failure_count_threshold = 10
setting.circuitbreaker_backoff_threshold = 15
failure_message = "The circuitbreaker backoff threshold should be lower "\
"than the failure count threshold"
expect(setting).not_to be_valid
expect(setting.errors[:circuitbreaker_backoff_threshold]).to include(failure_message)
end
end
context 'repository storages' do
......