Commit 7fabc892 authored by Douwe Maan's avatar Douwe Maan

Merge branch 'bvl-circuitbreaker-process' into 'master'

Check NFS mounts in a separate process

Closes #39847

See merge request gitlab-org/gitlab-ce!15426
parents 12d33b88 f1ae1e39
...@@ -5,7 +5,7 @@ class Admin::HealthCheckController < Admin::ApplicationController ...@@ -5,7 +5,7 @@ class Admin::HealthCheckController < Admin::ApplicationController
end end
def reset_storage_health def reset_storage_health
Gitlab::Git::Storage::CircuitBreaker.reset_all! Gitlab::Git::Storage::FailureInfo.reset_all!
redirect_to admin_health_check_path, redirect_to admin_health_check_path,
notice: _('Git storage health information has been reset') notice: _('Git storage health information has been reset')
end end
......
class HealthController < ActionController::Base class HealthController < ActionController::Base
protect_from_forgery with: :exception protect_from_forgery with: :exception, except: :storage_check
include RequiresWhitelistedMonitoringClient include RequiresWhitelistedMonitoringClient
CHECKS = [ CHECKS = [
...@@ -23,6 +23,15 @@ class HealthController < ActionController::Base ...@@ -23,6 +23,15 @@ class HealthController < ActionController::Base
render_check_results(results) render_check_results(results)
end end
def storage_check
results = Gitlab::Git::Storage::Checker.check_all
render json: {
check_interval: Gitlab::CurrentSettings.current_application_settings.circuitbreaker_check_interval,
results: results
}
end
private private
def render_check_results(results) def render_check_results(results)
......
...@@ -124,17 +124,6 @@ module ApplicationSettingsHelper ...@@ -124,17 +124,6 @@ module ApplicationSettingsHelper
_('The number of attempts GitLab will make to access a storage.') _('The number of attempts GitLab will make to access a storage.')
end end
def circuitbreaker_backoff_threshold_help_text
_("The number of failures after which GitLab will start temporarily "\
"disabling access to a storage shard on a host")
end
def circuitbreaker_failure_wait_time_help_text
_("When access to a storage fails. GitLab will prevent access to the "\
"storage for the time specified here. This allows the filesystem to "\
"recover. Repositories on failing shards are temporarly unavailable")
end
def circuitbreaker_failure_reset_time_help_text def circuitbreaker_failure_reset_time_help_text
_("The time in seconds GitLab will keep failure information. When no "\ _("The time in seconds GitLab will keep failure information. When no "\
"failures occur during this time, information about the mount is reset.") "failures occur during this time, information about the mount is reset.")
...@@ -145,6 +134,11 @@ module ApplicationSettingsHelper ...@@ -145,6 +134,11 @@ module ApplicationSettingsHelper
"timeout error will be raised.") "timeout error will be raised.")
end end
def circuitbreaker_check_interval_help_text
_("The time in seconds between storage checks. When a previous check did "\
"complete yet, GitLab will skip a check.")
end
def visible_attributes def visible_attributes
[ [
:admin_notification_email, :admin_notification_email,
...@@ -154,10 +148,9 @@ module ApplicationSettingsHelper ...@@ -154,10 +148,9 @@ module ApplicationSettingsHelper
:akismet_enabled, :akismet_enabled,
:auto_devops_enabled, :auto_devops_enabled,
:circuitbreaker_access_retries, :circuitbreaker_access_retries,
:circuitbreaker_backoff_threshold, :circuitbreaker_check_interval,
:circuitbreaker_failure_count_threshold, :circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_reset_time, :circuitbreaker_failure_reset_time,
:circuitbreaker_failure_wait_time,
:circuitbreaker_storage_timeout, :circuitbreaker_storage_timeout,
:clientside_sentry_dsn, :clientside_sentry_dsn,
:clientside_sentry_enabled, :clientside_sentry_enabled,
......
...@@ -18,16 +18,12 @@ module StorageHealthHelper ...@@ -18,16 +18,12 @@ module StorageHealthHelper
current_failures = circuit_breaker.failure_count current_failures = circuit_breaker.failure_count
translation_params = { number_of_failures: current_failures, translation_params = { number_of_failures: current_failures,
maximum_failures: maximum_failures, maximum_failures: maximum_failures }
number_of_seconds: circuit_breaker.failure_wait_time }
if circuit_breaker.circuit_broken? if circuit_breaker.circuit_broken?
s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\ s_("%{number_of_failures} of %{maximum_failures} failures. GitLab will not "\
"retry automatically. Reset storage information when the problem is "\ "retry automatically. Reset storage information when the problem is "\
"resolved.") % translation_params "resolved.") % translation_params
elsif circuit_breaker.backing_off?
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"block access for %{number_of_seconds} seconds.") % translation_params
else else
_("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\ _("%{number_of_failures} of %{maximum_failures} failures. GitLab will "\
"allow access on the next attempt.") % translation_params "allow access on the next attempt.") % translation_params
......
...@@ -153,11 +153,10 @@ class ApplicationSetting < ActiveRecord::Base ...@@ -153,11 +153,10 @@ class ApplicationSetting < ActiveRecord::Base
presence: true, presence: true,
numericality: { greater_than_or_equal_to: 0 } numericality: { greater_than_or_equal_to: 0 }
validates :circuitbreaker_backoff_threshold, validates :circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_wait_time,
:circuitbreaker_failure_reset_time, :circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout, :circuitbreaker_storage_timeout,
:circuitbreaker_check_interval,
presence: true, presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 } numericality: { only_integer: true, greater_than_or_equal_to: 0 }
...@@ -165,13 +164,6 @@ class ApplicationSetting < ActiveRecord::Base ...@@ -165,13 +164,6 @@ class ApplicationSetting < ActiveRecord::Base
presence: true, presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 1 } numericality: { only_integer: true, greater_than_or_equal_to: 1 }
validates_each :circuitbreaker_backoff_threshold do |record, attr, value|
if value.to_i >= record.circuitbreaker_failure_count_threshold
record.errors.add(attr, _("The circuitbreaker backoff threshold should be "\
"lower than the failure count threshold"))
end
end
validates :gitaly_timeout_default, validates :gitaly_timeout_default,
presence: true, presence: true,
numericality: { only_integer: true, greater_than_or_equal_to: 0 } numericality: { only_integer: true, greater_than_or_equal_to: 0 }
......
...@@ -545,6 +545,12 @@ ...@@ -545,6 +545,12 @@
%fieldset %fieldset
%legend Git Storage Circuitbreaker settings %legend Git Storage Circuitbreaker settings
.form-group
= f.label :circuitbreaker_check_interval, _('Check interval'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_check_interval, class: 'form-control'
.help-block
= circuitbreaker_check_interval_help_text
.form-group .form-group
= f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2' = f.label :circuitbreaker_access_retries, _('Number of access attempts'), class: 'control-label col-sm-2'
.col-sm-10 .col-sm-10
...@@ -557,18 +563,6 @@ ...@@ -557,18 +563,6 @@
= f.number_field :circuitbreaker_storage_timeout, class: 'form-control' = f.number_field :circuitbreaker_storage_timeout, class: 'form-control'
.help-block .help-block
= circuitbreaker_storage_timeout_help_text = circuitbreaker_storage_timeout_help_text
.form-group
= f.label :circuitbreaker_backoff_threshold, _('Number of failures before backing off'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_backoff_threshold, class: 'form-control'
.help-block
= circuitbreaker_backoff_threshold_help_text
.form-group
= f.label :circuitbreaker_failure_wait_time, _('Seconds to wait after a storage failure'), class: 'control-label col-sm-2'
.col-sm-10
= f.number_field :circuitbreaker_failure_wait_time, class: 'form-control'
.help-block
= circuitbreaker_failure_wait_time_help_text
.form-group .form-group
= f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2' = f.label :circuitbreaker_failure_count_threshold, _('Maximum git storage failures'), class: 'control-label col-sm-2'
.col-sm-10 .col-sm-10
......
#!/usr/bin/env ruby
require 'optparse'
require 'net/http'
require 'json'
require 'socket'
require 'logger'
require_relative '../lib/gitlab/storage_check'
Gitlab::StorageCheck::CLI.start!(ARGV)
---
title: Monitor NFS shards for circuitbreaker in a separate process
merge_request: 15426
author:
type: changed
...@@ -42,6 +42,7 @@ Rails.application.routes.draw do ...@@ -42,6 +42,7 @@ Rails.application.routes.draw do
scope path: '-' do scope path: '-' do
get 'liveness' => 'health#liveness' get 'liveness' => 'health#liveness'
get 'readiness' => 'health#readiness' get 'readiness' => 'health#readiness'
post 'storage_check' => 'health#storage_check'
resources :metrics, only: [:index] resources :metrics, only: [:index]
mount Peek::Railtie => '/peek' mount Peek::Railtie => '/peek'
......
class AddCircuitbreakerCheckIntervalToApplicationSettings < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
disable_ddl_transaction!
def up
add_column_with_default :application_settings,
:circuitbreaker_check_interval,
:integer,
default: 1
end
def down
remove_column :application_settings,
:circuitbreaker_check_interval
end
end
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class UpdateCircuitbreakerDefaults < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
class ApplicationSetting < ActiveRecord::Base; end
def up
change_column_default :application_settings,
:circuitbreaker_failure_count_threshold,
3
change_column_default :application_settings,
:circuitbreaker_storage_timeout,
15
ApplicationSetting.update_all(circuitbreaker_failure_count_threshold: 3,
circuitbreaker_storage_timeout: 15)
end
def down
change_column_default :application_settings,
:circuitbreaker_failure_count_threshold,
160
change_column_default :application_settings,
:circuitbreaker_storage_timeout,
30
ApplicationSetting.update_all(circuitbreaker_failure_count_threshold: 160,
circuitbreaker_storage_timeout: 30)
end
end
# See http://doc.gitlab.com/ce/development/migration_style_guide.html
# for more information on how to write migrations for GitLab.
class RemoveOldCircuitbreakerConfig < ActiveRecord::Migration
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
def up
remove_column :application_settings,
:circuitbreaker_backoff_threshold
remove_column :application_settings,
:circuitbreaker_failure_wait_time
end
def down
add_column :application_settings,
:circuitbreaker_backoff_threshold,
:integer,
default: 80
add_column :application_settings,
:circuitbreaker_failure_wait_time,
:integer,
default: 30
end
end
...@@ -135,12 +135,10 @@ ActiveRecord::Schema.define(version: 20171205190711) do ...@@ -135,12 +135,10 @@ ActiveRecord::Schema.define(version: 20171205190711) do
t.boolean "hashed_storage_enabled", default: false, null: false t.boolean "hashed_storage_enabled", default: false, null: false
t.boolean "project_export_enabled", default: true, null: false t.boolean "project_export_enabled", default: true, null: false
t.boolean "auto_devops_enabled", default: false, null: false t.boolean "auto_devops_enabled", default: false, null: false
t.integer "circuitbreaker_failure_count_threshold", default: 160 t.integer "circuitbreaker_failure_count_threshold", default: 3
t.integer "circuitbreaker_failure_wait_time", default: 30
t.integer "circuitbreaker_failure_reset_time", default: 1800 t.integer "circuitbreaker_failure_reset_time", default: 1800
t.integer "circuitbreaker_storage_timeout", default: 30 t.integer "circuitbreaker_storage_timeout", default: 15
t.integer "circuitbreaker_access_retries", default: 3 t.integer "circuitbreaker_access_retries", default: 3
t.integer "circuitbreaker_backoff_threshold", default: 80
t.boolean "throttle_unauthenticated_enabled", default: false, null: false t.boolean "throttle_unauthenticated_enabled", default: false, null: false
t.integer "throttle_unauthenticated_requests_per_period", default: 3600, null: false t.integer "throttle_unauthenticated_requests_per_period", default: 3600, null: false
t.integer "throttle_unauthenticated_period_in_seconds", default: 3600, null: false t.integer "throttle_unauthenticated_period_in_seconds", default: 3600, null: false
...@@ -150,6 +148,7 @@ ActiveRecord::Schema.define(version: 20171205190711) do ...@@ -150,6 +148,7 @@ ActiveRecord::Schema.define(version: 20171205190711) do
t.boolean "throttle_authenticated_web_enabled", default: false, null: false t.boolean "throttle_authenticated_web_enabled", default: false, null: false
t.integer "throttle_authenticated_web_requests_per_period", default: 7200, null: false t.integer "throttle_authenticated_web_requests_per_period", default: 7200, null: false
t.integer "throttle_authenticated_web_period_in_seconds", default: 3600, null: false t.integer "throttle_authenticated_web_period_in_seconds", default: 3600, null: false
t.integer "circuitbreaker_check_interval", default: 1, null: false
t.boolean "password_authentication_enabled_for_web" t.boolean "password_authentication_enabled_for_web"
t.boolean "password_authentication_enabled_for_git", default: true t.boolean "password_authentication_enabled_for_git", default: true
t.integer "gitaly_timeout_default", default: 55, null: false t.integer "gitaly_timeout_default", default: 55, null: false
......
...@@ -70,10 +70,9 @@ PUT /application/settings ...@@ -70,10 +70,9 @@ PUT /application/settings
| `akismet_api_key` | string | no | API key for akismet spam protection | | `akismet_api_key` | string | no | API key for akismet spam protection |
| `akismet_enabled` | boolean | no | Enable or disable akismet spam protection | | `akismet_enabled` | boolean | no | Enable or disable akismet spam protection |
| `circuitbreaker_access_retries | integer | no | The number of attempts GitLab will make to access a storage. | | `circuitbreaker_access_retries | integer | no | The number of attempts GitLab will make to access a storage. |
| `circuitbreaker_backoff_threshold | integer | no | The number of failures after which GitLab will start temporarily disabling access to a storage shard on a host. | | `circuitbreaker_check_interval` | integer | no | Number of seconds in between storage checks. |
| `circuitbreaker_failure_count_threshold` | integer | no | The number of failures of after which GitLab will completely prevent access to the storage. | | `circuitbreaker_failure_count_threshold` | integer | no | The number of failures of after which GitLab will completely prevent access to the storage. |
| `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. | | `circuitbreaker_failure_reset_time` | integer | no | Time in seconds GitLab will keep storage failure information. When no failures occur during this time, the failure information is reset. |
| `circuitbreaker_failure_wait_time` | integer | no | Time in seconds GitLab will block access to a failing storage to allow it to recover. |
| `circuitbreaker_storage_timeout` | integer | no | Seconds to wait for a storage access attempt | | `circuitbreaker_storage_timeout` | integer | no | Seconds to wait for a storage access attempt |
| `clientside_sentry_dsn` | string | no | Required if `clientside_sentry_dsn` is enabled | | `clientside_sentry_dsn` | string | no | Required if `clientside_sentry_dsn` is enabled |
| `clientside_sentry_enabled` | boolean | no | Enable Sentry error reporting for the client side | | `clientside_sentry_enabled` | boolean | no | Enable Sentry error reporting for the client side |
......
...@@ -41,7 +41,7 @@ module API ...@@ -41,7 +41,7 @@ module API
detail 'This feature was introduced in GitLab 9.5' detail 'This feature was introduced in GitLab 9.5'
end end
delete do delete do
Gitlab::Git::Storage::CircuitBreaker.reset_all! Gitlab::Git::Storage::FailureInfo.reset_all!
end end
end end
end end
......
module Gitlab
module Git
module Storage
class Checker
include CircuitBreakerSettings
attr_reader :storage_path, :storage, :hostname, :logger
def self.check_all(logger = Rails.logger)
threads = Gitlab.config.repositories.storages.keys.map do |storage_name|
Thread.new do
Thread.current[:result] = new(storage_name, logger).check_with_lease
end
end
threads.map do |thread|
thread.join
thread[:result]
end
end
def initialize(storage, logger = Rails.logger)
@storage = storage
config = Gitlab.config.repositories.storages[@storage]
@storage_path = config['path']
@logger = logger
@hostname = Gitlab::Environment.hostname
end
def check_with_lease
lease_key = "storage_check:#{cache_key}"
lease = Gitlab::ExclusiveLease.new(lease_key, timeout: storage_timeout)
result = { storage: storage, success: nil }
if uuid = lease.try_obtain
result[:success] = check
Gitlab::ExclusiveLease.cancel(lease_key, uuid)
else
logger.warn("#{hostname}: #{storage}: Skipping check, previous check still running")
end
result
end
def check
if Gitlab::Git::Storage::ForkedStorageCheck.storage_available?(storage_path, storage_timeout, access_retries)
track_storage_accessible
true
else
track_storage_inaccessible
logger.error("#{hostname}: #{storage}: Not accessible.")
false
end
end
private
def track_storage_inaccessible
first_failure = current_failure_info.first_failure || Time.now
last_failure = Time.now
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :first_failure, first_failure.to_i)
redis.hset(cache_key, :last_failure, last_failure.to_i)
redis.hincrby(cache_key, :failure_count, 1)
redis.expire(cache_key, failure_reset_time)
maintain_known_keys(redis)
end
end
end
def track_storage_accessible
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :first_failure, nil)
redis.hset(cache_key, :last_failure, nil)
redis.hset(cache_key, :failure_count, 0)
maintain_known_keys(redis)
end
end
end
def maintain_known_keys(redis)
expire_time = Time.now.to_i + failure_reset_time
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, expire_time, cache_key)
redis.zremrangebyscore(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, '-inf', Time.now.to_i)
end
def current_failure_info
FailureInfo.load(cache_key)
end
end
end
end
end
...@@ -4,22 +4,11 @@ module Gitlab ...@@ -4,22 +4,11 @@ module Gitlab
class CircuitBreaker class CircuitBreaker
include CircuitBreakerSettings include CircuitBreakerSettings
FailureInfo = Struct.new(:last_failure, :failure_count)
attr_reader :storage, attr_reader :storage,
:hostname, :hostname
:storage_path
delegate :last_failure, :failure_count, to: :failure_info
def self.reset_all!
Gitlab::Git::Storage.redis.with do |redis|
all_storage_keys = redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
redis.del(*all_storage_keys) unless all_storage_keys.empty?
end
RequestStore.delete(:circuitbreaker_cache) delegate :last_failure, :failure_count, :no_failures?,
end to: :failure_info
def self.for_storage(storage) def self.for_storage(storage)
cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do cached_circuitbreakers = RequestStore.fetch(:circuitbreaker_cache) do
...@@ -46,9 +35,6 @@ module Gitlab ...@@ -46,9 +35,6 @@ module Gitlab
def initialize(storage, hostname) def initialize(storage, hostname)
@storage = storage @storage = storage
@hostname = hostname @hostname = hostname
config = Gitlab.config.repositories.storages[@storage]
@storage_path = config['path']
end end
def perform def perform
...@@ -65,15 +51,6 @@ module Gitlab ...@@ -65,15 +51,6 @@ module Gitlab
failure_count > failure_count_threshold failure_count > failure_count_threshold
end end
def backing_off?
return false if no_failures?
recent_failure = last_failure > failure_wait_time.seconds.ago
too_many_failures = failure_count > backoff_threshold
recent_failure && too_many_failures
end
private private
# The circuitbreaker can be enabled for the entire fleet using a Feature # The circuitbreaker can be enabled for the entire fleet using a Feature
...@@ -86,88 +63,13 @@ module Gitlab ...@@ -86,88 +63,13 @@ module Gitlab
end end
def failure_info def failure_info
@failure_info ||= get_failure_info @failure_info ||= FailureInfo.load(cache_key)
end
# Memoizing the `storage_available` call means we only do it once per
# request when the storage is available.
#
# When the storage appears not available, and the memoized value is `false`
# we might want to try again.
def storage_available?
return @storage_available if @storage_available
if @storage_available = Gitlab::Git::Storage::ForkedStorageCheck
.storage_available?(storage_path, storage_timeout, access_retries)
track_storage_accessible
else
track_storage_inaccessible
end
@storage_available
end end
def check_storage_accessible! def check_storage_accessible!
if circuit_broken? if circuit_broken?
raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time) raise Gitlab::Git::Storage::CircuitOpen.new("Circuit for #{storage} is broken", failure_reset_time)
end end
if backing_off?
raise Gitlab::Git::Storage::Failing.new("Backing off access to #{storage}", failure_wait_time)
end
unless storage_available?
raise Gitlab::Git::Storage::Inaccessible.new("#{storage} not accessible", failure_wait_time)
end
end
def no_failures?
last_failure.blank? && failure_count == 0
end
def track_storage_inaccessible
@failure_info = FailureInfo.new(Time.now, failure_count + 1)
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :last_failure, last_failure.to_i)
redis.hincrby(cache_key, :failure_count, 1)
redis.expire(cache_key, failure_reset_time)
maintain_known_keys(redis)
end
end
end
def track_storage_accessible
@failure_info = FailureInfo.new(nil, 0)
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.hset(cache_key, :last_failure, nil)
redis.hset(cache_key, :failure_count, 0)
maintain_known_keys(redis)
end
end
end
def maintain_known_keys(redis)
expire_time = Time.now.to_i + failure_reset_time
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, expire_time, cache_key)
redis.zremrangebyscore(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, '-inf', Time.now.to_i)
end
def get_failure_info
last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, :last_failure, :failure_count)
end
last_failure = Time.at(last_failure.to_i) if last_failure.present?
FailureInfo.new(last_failure, failure_count.to_i)
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end end
end end
end end
......
...@@ -6,10 +6,6 @@ module Gitlab ...@@ -6,10 +6,6 @@ module Gitlab
application_settings.circuitbreaker_failure_count_threshold application_settings.circuitbreaker_failure_count_threshold
end end
def failure_wait_time
application_settings.circuitbreaker_failure_wait_time
end
def failure_reset_time def failure_reset_time
application_settings.circuitbreaker_failure_reset_time application_settings.circuitbreaker_failure_reset_time
end end
...@@ -22,8 +18,12 @@ module Gitlab ...@@ -22,8 +18,12 @@ module Gitlab
application_settings.circuitbreaker_access_retries application_settings.circuitbreaker_access_retries
end end
def backoff_threshold def check_interval
application_settings.circuitbreaker_backoff_threshold application_settings.circuitbreaker_check_interval
end
def cache_key
@cache_key ||= "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}#{storage}:#{hostname}"
end end
private private
......
module Gitlab
module Git
module Storage
class FailureInfo
attr_accessor :first_failure, :last_failure, :failure_count
def self.reset_all!
Gitlab::Git::Storage.redis.with do |redis|
all_storage_keys = redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
redis.del(*all_storage_keys) unless all_storage_keys.empty?
end
RequestStore.delete(:circuitbreaker_cache)
end
def self.load(cache_key)
first_failure, last_failure, failure_count = Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, :first_failure, :last_failure, :failure_count)
end
last_failure = Time.at(last_failure.to_i) if last_failure.present?
first_failure = Time.at(first_failure.to_i) if first_failure.present?
new(first_failure, last_failure, failure_count.to_i)
end
def initialize(first_failure, last_failure, failure_count)
@first_failure = first_failure
@last_failure = last_failure
@failure_count = failure_count
end
def no_failures?
first_failure.blank? && last_failure.blank? && failure_count == 0
end
end
end
end
end
...@@ -11,6 +11,9 @@ module Gitlab ...@@ -11,6 +11,9 @@ module Gitlab
# These will always have nil values # These will always have nil values
attr_reader :storage_path attr_reader :storage_path
delegate :last_failure, :failure_count, :no_failures?,
to: :failure_info
def initialize(storage, hostname, error: nil) def initialize(storage, hostname, error: nil)
@storage = storage @storage = storage
@hostname = hostname @hostname = hostname
...@@ -29,16 +32,17 @@ module Gitlab ...@@ -29,16 +32,17 @@ module Gitlab
false false
end end
def last_failure
circuit_broken? ? Time.now : nil
end
def failure_count
circuit_broken? ? failure_count_threshold : 0
end
def failure_info def failure_info
Gitlab::Git::Storage::CircuitBreaker::FailureInfo.new(last_failure, failure_count) @failure_info ||=
if circuit_broken?
Gitlab::Git::Storage::FailureInfo.new(Time.now,
Time.now,
failure_count_threshold)
else
Gitlab::Git::Storage::FailureInfo.new(nil,
nil,
0)
end
end end
end end
end end
......
require_relative 'storage_check/cli'
require_relative 'storage_check/gitlab_caller'
require_relative 'storage_check/option_parser'
require_relative 'storage_check/response'
module Gitlab
module StorageCheck
ENDPOINT = '/-/storage_check'.freeze
Options = Struct.new(:target, :token, :interval, :dryrun)
end
end
module Gitlab
module StorageCheck
class CLI
def self.start!(args)
runner = new(Gitlab::StorageCheck::OptionParser.parse!(args))
runner.start_loop
end
attr_reader :logger, :options
def initialize(options)
@options = options
@logger = Logger.new(STDOUT)
end
def start_loop
logger.info "Checking #{options.target} every #{options.interval} seconds"
if options.dryrun
logger.info "Dryrun, exiting..."
return
end
begin
loop do
response = GitlabCaller.new(options).call!
log_response(response)
update_settings(response)
sleep options.interval
end
rescue Interrupt
logger.info "Ending storage-check"
end
end
def update_settings(response)
previous_interval = options.interval
if response.valid?
options.interval = response.check_interval || previous_interval
end
if previous_interval != options.interval
logger.info "Interval changed: #{options.interval} seconds"
end
end
def log_response(response)
unless response.valid?
return logger.error("Invalid response checking nfs storage: #{response.http_response.inspect}")
end
if response.responsive_shards.any?
logger.debug("Responsive shards: #{response.responsive_shards.join(', ')}")
end
warnings = []
if response.skipped_shards.any?
warnings << "Skipped shards: #{response.skipped_shards.join(', ')}"
end
if response.failing_shards.any?
warnings << "Failing shards: #{response.failing_shards.join(', ')}"
end
logger.warn(warnings.join(' - ')) if warnings.any?
end
end
end
end
require 'excon'
module Gitlab
module StorageCheck
class GitlabCaller
def initialize(options)
@options = options
end
def call!
Gitlab::StorageCheck::Response.new(get_response)
rescue Errno::ECONNREFUSED, Excon::Error
# Server not ready, treated as invalid response.
Gitlab::StorageCheck::Response.new(nil)
end
def get_response
scheme, *other_parts = URI.split(@options.target)
socket_path = if scheme == 'unix'
other_parts.compact.join
end
connection = Excon.new(@options.target, socket: socket_path)
connection.post(path: Gitlab::StorageCheck::ENDPOINT,
headers: headers)
end
def headers
@headers ||= begin
headers = {}
headers['Content-Type'] = headers['Accept'] = 'application/json'
headers['TOKEN'] = @options.token if @options.token
headers
end
end
end
end
end
module Gitlab
module StorageCheck
class OptionParser
def self.parse!(args)
# Start out with some defaults
options = Gitlab::StorageCheck::Options.new(nil, nil, 1, false)
parser = ::OptionParser.new do |opts|
opts.banner = "Usage: bin/storage_check [options]"
opts.on('-t=string', '--target string', 'URL or socket to trigger storage check') do |value|
options.target = value
end
opts.on('-T=string', '--token string', 'Health token to use') { |value| options.token = value }
opts.on('-i=n', '--interval n', ::OptionParser::DecimalInteger, 'Seconds between checks') do |value|
options.interval = value
end
opts.on('-d', '--dryrun', "Output what will be performed, but don't start the process") do |value|
options.dryrun = value
end
end
parser.parse!(args)
unless options.target
raise ::OptionParser::InvalidArgument.new('Provide a URI to provide checks')
end
if URI.parse(options.target).scheme.nil?
raise ::OptionParser::InvalidArgument.new('Add the scheme to the target, `unix://`, `https://` or `http://` are supported')
end
options
end
end
end
end
require 'json'
module Gitlab
module StorageCheck
class Response
attr_reader :http_response
def initialize(http_response)
@http_response = http_response
end
def valid?
@http_response && (200...299).cover?(@http_response.status) &&
@http_response.headers['Content-Type'].include?('application/json') &&
parsed_response
end
def check_interval
return nil unless parsed_response
parsed_response['check_interval']
end
def responsive_shards
divided_results[:responsive_shards]
end
def skipped_shards
divided_results[:skipped_shards]
end
def failing_shards
divided_results[:failing_shards]
end
private
def results
return [] unless parsed_response
parsed_response['results']
end
def divided_results
return @divided_results if @divided_results
@divided_results = {}
@divided_results[:responsive_shards] = []
@divided_results[:skipped_shards] = []
@divided_results[:failing_shards] = []
results.each do |info|
name = info['storage']
case info['success']
when true
@divided_results[:responsive_shards] << name
when false
@divided_results[:failing_shards] << name
else
@divided_results[:skipped_shards] << name
end
end
@divided_results
end
def parsed_response
return @parsed_response if defined?(@parsed_response)
@parsed_response = JSON.parse(@http_response.body)
rescue JSON::JSONError
@parsed_response = nil
end
end
end
end
require 'spec_helper'
describe 'bin/storage_check' do
it 'is executable' do
command = %w[bin/storage_check -t unix://the/path/to/a/unix-socket.sock -i 10 -d]
expected_output = 'Checking unix://the/path/to/a/unix-socket.sock every 10 seconds'
output, status = Gitlab::Popen.popen(command, Rails.root.to_s)
expect(status).to eq(0)
expect(output).to include(expected_output)
end
end
require 'spec_helper' require 'spec_helper'
describe Admin::HealthCheckController, broken_storage: true do describe Admin::HealthCheckController do
let(:admin) { create(:admin) } let(:admin) { create(:admin) }
before do before do
...@@ -17,7 +17,7 @@ describe Admin::HealthCheckController, broken_storage: true do ...@@ -17,7 +17,7 @@ describe Admin::HealthCheckController, broken_storage: true do
describe 'POST reset_storage_health' do describe 'POST reset_storage_health' do
it 'resets all storage health information' do it 'resets all storage health information' do
expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!) expect(Gitlab::Git::Storage::FailureInfo).to receive(:reset_all!)
post :reset_storage_health post :reset_storage_health
end end
......
...@@ -14,6 +14,48 @@ describe HealthController do ...@@ -14,6 +14,48 @@ describe HealthController do
stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'false') stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'false')
end end
describe '#storage_check' do
before do
allow(Gitlab::RequestContext).to receive(:client_ip).and_return(whitelisted_ip)
end
subject { post :storage_check }
it 'checks all the configured storages' do
expect(Gitlab::Git::Storage::Checker).to receive(:check_all).and_call_original
subject
end
it 'returns the check interval' do
stub_env('IN_MEMORY_APPLICATION_SETTINGS', 'true')
stub_application_setting(circuitbreaker_check_interval: 10)
subject
expect(json_response['check_interval']).to eq(10)
end
context 'with failing storages', :broken_storage do
before do
stub_storage_settings(
broken: { path: 'tmp/tests/non-existent-repositories' }
)
end
it 'includes the failure information' do
subject
expected_results = [
{ 'storage' => 'broken', 'success' => false },
{ 'storage' => 'default', 'success' => true }
]
expect(json_response['results']).to eq(expected_results)
end
end
end
describe '#readiness' do describe '#readiness' do
shared_context 'endpoint responding with readiness data' do shared_context 'endpoint responding with readiness data' do
let(:request_params) { {} } let(:request_params) { {} }
......
require 'spec_helper' require 'spec_helper'
feature "Admin Health Check", :feature, :broken_storage do feature "Admin Health Check", :feature do
include StubENV include StubENV
before do before do
...@@ -36,6 +36,7 @@ feature "Admin Health Check", :feature, :broken_storage do ...@@ -36,6 +36,7 @@ feature "Admin Health Check", :feature, :broken_storage do
context 'when services are up' do context 'when services are up' do
before do before do
stub_storage_settings({}) # Hide the broken storage
visit admin_health_check_path visit admin_health_check_path
end end
...@@ -56,10 +57,8 @@ feature "Admin Health Check", :feature, :broken_storage do ...@@ -56,10 +57,8 @@ feature "Admin Health Check", :feature, :broken_storage do
end end
end end
context 'with repository storage failures' do context 'with repository storage failures', :broken_storage do
before do before do
# Track a failure
Gitlab::Git::Storage::CircuitBreaker.for_storage('broken').perform { nil } rescue nil
visit admin_health_check_path visit admin_health_check_path
end end
...@@ -67,9 +66,10 @@ feature "Admin Health Check", :feature, :broken_storage do ...@@ -67,9 +66,10 @@ feature "Admin Health Check", :feature, :broken_storage do
hostname = Gitlab::Environment.hostname hostname = Gitlab::Environment.hostname
maximum_failures = Gitlab::CurrentSettings.current_application_settings maximum_failures = Gitlab::CurrentSettings.current_application_settings
.circuitbreaker_failure_count_threshold .circuitbreaker_failure_count_threshold
number_of_failures = maximum_failures + 1
expect(page).to have_content('broken: failed storage access attempt on host:') expect(page).to have_content("broken: #{number_of_failures} failed storage access attempts:")
expect(page).to have_content("#{hostname}: 1 of #{maximum_failures} failures.") expect(page).to have_content("#{hostname}: #{number_of_failures} of #{maximum_failures} failures.")
end end
it 'allows resetting storage failures' do it 'allows resetting storage failures' do
......
require 'spec_helper'
describe Gitlab::Git::Storage::Checker, :clean_gitlab_redis_shared_state do
let(:storage_name) { 'default' }
let(:hostname) { Gitlab::Environment.hostname }
let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" }
subject(:checker) { described_class.new(storage_name) }
def value_from_redis(name)
Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, name)
end.first
end
def set_in_redis(name, value)
Gitlab::Git::Storage.redis.with do |redis|
redis.hmset(cache_key, name, value)
end.first
end
describe '.check_all' do
it 'calls a check for each storage' do
fake_checker_default = double
fake_checker_broken = double
fake_logger = fake_logger
expect(described_class).to receive(:new).with('default', fake_logger) { fake_checker_default }
expect(described_class).to receive(:new).with('broken', fake_logger) { fake_checker_broken }
expect(fake_checker_default).to receive(:check_with_lease)
expect(fake_checker_broken).to receive(:check_with_lease)
described_class.check_all(fake_logger)
end
context 'with broken storage', :broken_storage do
it 'returns the results' do
expected_result = [
{ storage: 'default', success: true },
{ storage: 'broken', success: false }
]
expect(described_class.check_all).to eq(expected_result)
end
end
end
describe '#initialize' do
it 'assigns the settings' do
expect(checker.hostname).to eq(hostname)
expect(checker.storage).to eq('default')
expect(checker.storage_path).to eq(TestEnv.repos_path)
end
end
describe '#check_with_lease' do
it 'only allows one check at a time' do
expect(checker).to receive(:check).once { sleep 1 }
thread = Thread.new { checker.check_with_lease }
checker.check_with_lease
thread.join
end
it 'returns a result hash' do
expect(checker.check_with_lease).to eq(storage: 'default', success: true)
end
end
describe '#check' do
it 'tracks that the storage was accessible' do
set_in_redis(:failure_count, 10)
set_in_redis(:last_failure, Time.now.to_f)
checker.check
expect(value_from_redis(:failure_count).to_i).to eq(0)
expect(value_from_redis(:last_failure)).to be_empty
expect(value_from_redis(:first_failure)).to be_empty
end
it 'calls the check with the correct arguments' do
stub_application_setting(circuitbreaker_storage_timeout: 30,
circuitbreaker_access_retries: 3)
expect(Gitlab::Git::Storage::ForkedStorageCheck)
.to receive(:storage_available?).with(TestEnv.repos_path, 30, 3)
.and_call_original
checker.check
end
it 'returns `true`' do
expect(checker.check).to eq(true)
end
it 'maintains known storage keys' do
Timecop.freeze do
# Insert an old key to expire
old_entry = Time.now.to_i - 3.days.to_i
Gitlab::Git::Storage.redis.with do |redis|
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, old_entry, 'to_be_removed')
end
checker.check
known_keys = Gitlab::Git::Storage.redis.with do |redis|
redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
end
expect(known_keys).to contain_exactly(cache_key)
end
end
context 'the storage is not available', :broken_storage do
let(:storage_name) { 'broken' }
it 'tracks that the storage was inaccessible' do
Timecop.freeze do
expect { checker.check }.to change { value_from_redis(:failure_count).to_i }.by(1)
expect(value_from_redis(:last_failure)).not_to be_empty
expect(value_from_redis(:first_failure)).not_to be_empty
end
end
it 'returns `false`' do
expect(checker.check).to eq(false)
end
end
end
end
require 'spec_helper' require 'spec_helper'
describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: true, broken_storage: true do describe Gitlab::Git::Storage::CircuitBreaker, :broken_storage do
let(:storage_name) { 'default' } let(:storage_name) { 'default' }
let(:circuit_breaker) { described_class.new(storage_name, hostname) } let(:circuit_breaker) { described_class.new(storage_name, hostname) }
let(:hostname) { Gitlab::Environment.hostname } let(:hostname) { Gitlab::Environment.hostname }
let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" } let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" }
def set_in_redis(name, value)
Gitlab::Git::Storage.redis.with do |redis|
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
redis.hmset(cache_key, name, value)
end.first
end
before do before do
# Override test-settings for the circuitbreaker with something more realistic # Override test-settings for the circuitbreaker with something more realistic
# for these specs. # for these specs.
...@@ -19,36 +26,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -19,36 +26,7 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
) )
end end
def value_from_redis(name) describe '.for_storage', :request_store do
Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, name)
end.first
end
def set_in_redis(name, value)
Gitlab::Git::Storage.redis.with do |redis|
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
redis.hmset(cache_key, name, value)
end.first
end
describe '.reset_all!' do
it 'clears all entries form redis' do
set_in_redis(:failure_count, 10)
described_class.reset_all!
key_exists = Gitlab::Git::Storage.redis.with { |redis| redis.exists(cache_key) }
expect(key_exists).to be_falsey
end
it 'does not break when there are no keys in redis' do
expect { described_class.reset_all! }.not_to raise_error
end
end
describe '.for_storage' do
it 'only builds a single circuitbreaker per storage' do it 'only builds a single circuitbreaker per storage' do
expect(described_class).to receive(:new).once.and_call_original expect(described_class).to receive(:new).once.and_call_original
...@@ -71,7 +49,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -71,7 +49,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
it 'assigns the settings' do it 'assigns the settings' do
expect(circuit_breaker.hostname).to eq(hostname) expect(circuit_breaker.hostname).to eq(hostname)
expect(circuit_breaker.storage).to eq('default') expect(circuit_breaker.storage).to eq('default')
expect(circuit_breaker.storage_path).to eq(TestEnv.repos_path)
end end
end end
...@@ -91,9 +68,9 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -91,9 +68,9 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
end end
end end
describe '#failure_wait_time' do describe '#check_interval' do
it 'reads the value from settings' do it 'reads the value from settings' do
expect(circuit_breaker.failure_wait_time).to eq(1) expect(circuit_breaker.check_interval).to eq(1)
end end
end end
...@@ -114,12 +91,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -114,12 +91,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
expect(circuit_breaker.access_retries).to eq(4) expect(circuit_breaker.access_retries).to eq(4)
end end
end end
describe '#backoff_threshold' do
it 'reads the value from settings' do
expect(circuit_breaker.backoff_threshold).to eq(5)
end
end
end end
describe '#perform' do describe '#perform' do
...@@ -134,19 +105,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -134,19 +105,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
end end
end end
it 'raises the correct exception when backing off' do
Timecop.freeze do
set_in_redis(:last_failure, 1.second.ago.to_f)
set_in_redis(:failure_count, 90)
expect { |b| circuit_breaker.perform(&b) }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::Failing)
expect(exception.retry_after).to eq(30)
end
end
end
it 'yields the block' do it 'yields the block' do
expect { |b| circuit_breaker.perform(&b) } expect { |b| circuit_breaker.perform(&b) }
.to yield_control .to yield_control
...@@ -170,54 +128,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -170,54 +128,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
.to raise_error(Rugged::OSError) .to raise_error(Rugged::OSError)
end end
it 'tracks that the storage was accessible' do
set_in_redis(:failure_count, 10)
set_in_redis(:last_failure, Time.now.to_f)
circuit_breaker.perform { '' }
expect(value_from_redis(:failure_count).to_i).to eq(0)
expect(value_from_redis(:last_failure)).to be_empty
expect(circuit_breaker.failure_count).to eq(0)
expect(circuit_breaker.last_failure).to be_nil
end
it 'maintains known storage keys' do
Timecop.freeze do
# Insert an old key to expire
old_entry = Time.now.to_i - 3.days.to_i
Gitlab::Git::Storage.redis.with do |redis|
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, old_entry, 'to_be_removed')
end
circuit_breaker.perform { '' }
known_keys = Gitlab::Git::Storage.redis.with do |redis|
redis.zrange(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, -1)
end
expect(known_keys).to contain_exactly(cache_key)
end
end
it 'only performs the accessibility check once' do
expect(Gitlab::Git::Storage::ForkedStorageCheck)
.to receive(:storage_available?).once.and_call_original
2.times { circuit_breaker.perform { '' } }
end
it 'calls the check with the correct arguments' do
stub_application_setting(circuitbreaker_storage_timeout: 30,
circuitbreaker_access_retries: 3)
expect(Gitlab::Git::Storage::ForkedStorageCheck)
.to receive(:storage_available?).with(TestEnv.repos_path, 30, 3)
.and_call_original
circuit_breaker.perform { '' }
end
context 'with the feature disabled' do context 'with the feature disabled' do
before do before do
stub_feature_flags(git_storage_circuit_breaker: false) stub_feature_flags(git_storage_circuit_breaker: false)
...@@ -240,31 +150,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -240,31 +150,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
expect(result).to eq('hello') expect(result).to eq('hello')
end end
end end
context 'the storage is not available' do
let(:storage_name) { 'broken' }
it 'raises the correct exception' do
expect(circuit_breaker).to receive(:track_storage_inaccessible)
expect { circuit_breaker.perform { '' } }
.to raise_error do |exception|
expect(exception).to be_kind_of(Gitlab::Git::Storage::Inaccessible)
expect(exception.retry_after).to eq(30)
end
end
it 'tracks that the storage was inaccessible' do
Timecop.freeze do
expect { circuit_breaker.perform { '' } }.to raise_error(Gitlab::Git::Storage::Inaccessible)
expect(value_from_redis(:failure_count).to_i).to eq(1)
expect(value_from_redis(:last_failure)).not_to be_empty
expect(circuit_breaker.failure_count).to eq(1)
expect(circuit_breaker.last_failure).to be_within(1.second).of(Time.now)
end
end
end
end end
describe '#circuit_broken?' do describe '#circuit_broken?' do
...@@ -283,32 +168,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state: ...@@ -283,32 +168,6 @@ describe Gitlab::Git::Storage::CircuitBreaker, clean_gitlab_redis_shared_state:
end end
end end
describe '#backing_off?' do
it 'is true when there was a recent failure' do
Timecop.freeze do
set_in_redis(:last_failure, 1.second.ago.to_f)
set_in_redis(:failure_count, 90)
expect(circuit_breaker.backing_off?).to be_truthy
end
end
context 'the `failure_wait_time` is set to 0' do
before do
stub_application_setting(circuitbreaker_failure_wait_time: 0)
end
it 'is working even when there are failures' do
Timecop.freeze do
set_in_redis(:last_failure, 0.seconds.ago.to_f)
set_in_redis(:failure_count, 90)
expect(circuit_breaker.backing_off?).to be_falsey
end
end
end
end
describe '#last_failure' do describe '#last_failure' do
it 'returns the last failure time' do it 'returns the last failure time' do
time = Time.parse("2017-05-26 17:52:30") time = Time.parse("2017-05-26 17:52:30")
......
require 'spec_helper'
describe Gitlab::Git::Storage::FailureInfo, :broken_storage do
let(:storage_name) { 'default' }
let(:hostname) { Gitlab::Environment.hostname }
let(:cache_key) { "storage_accessible:#{storage_name}:#{hostname}" }
def value_from_redis(name)
Gitlab::Git::Storage.redis.with do |redis|
redis.hmget(cache_key, name)
end.first
end
def set_in_redis(name, value)
Gitlab::Git::Storage.redis.with do |redis|
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
redis.hmset(cache_key, name, value)
end.first
end
describe '.reset_all!' do
it 'clears all entries form redis' do
set_in_redis(:failure_count, 10)
described_class.reset_all!
key_exists = Gitlab::Git::Storage.redis.with { |redis| redis.exists(cache_key) }
expect(key_exists).to be_falsey
end
it 'does not break when there are no keys in redis' do
expect { described_class.reset_all! }.not_to raise_error
end
end
describe '.load' do
it 'loads failure information for a storage on a host' do
first_failure = Time.parse("2017-11-14 17:52:30")
last_failure = Time.parse("2017-11-14 18:54:37")
failure_count = 11
set_in_redis(:first_failure, first_failure.to_i)
set_in_redis(:last_failure, last_failure.to_i)
set_in_redis(:failure_count, failure_count.to_i)
info = described_class.load(cache_key)
expect(info.first_failure).to eq(first_failure)
expect(info.last_failure).to eq(last_failure)
expect(info.failure_count).to eq(failure_count)
end
end
describe '#no_failures?' do
it 'is true when there are no failures' do
info = described_class.new(nil, nil, 0)
expect(info.no_failures?).to be_truthy
end
it 'is false when there are failures' do
info = described_class.new(Time.parse("2017-11-14 17:52:30"),
Time.parse("2017-11-14 18:54:37"),
20)
expect(info.no_failures?).to be_falsy
end
end
end
require 'spec_helper' require 'spec_helper'
describe Gitlab::Git::Storage::Health, clean_gitlab_redis_shared_state: true, broken_storage: true do describe Gitlab::Git::Storage::Health, broken_storage: true do
let(:host1_key) { 'storage_accessible:broken:web01' } let(:host1_key) { 'storage_accessible:broken:web01' }
let(:host2_key) { 'storage_accessible:default:kiq01' } let(:host2_key) { 'storage_accessible:default:kiq01' }
......
...@@ -27,7 +27,7 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do ...@@ -27,7 +27,7 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
end end
describe '#failure_info' do describe '#failure_info' do
it { Timecop.freeze { expect(breaker.failure_info).to eq(Gitlab::Git::Storage::CircuitBreaker::FailureInfo.new(Time.now, breaker.failure_count_threshold)) } } it { expect(breaker.failure_info.no_failures?).to be_falsy }
end end
end end
...@@ -49,7 +49,7 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do ...@@ -49,7 +49,7 @@ describe Gitlab::Git::Storage::NullCircuitBreaker do
end end
describe '#failure_info' do describe '#failure_info' do
it { expect(breaker.failure_info).to eq(Gitlab::Git::Storage::CircuitBreaker::FailureInfo.new(nil, 0)) } it { expect(breaker.failure_info.no_failures?).to be_truthy }
end end
end end
......
require 'spec_helper'
describe Gitlab::StorageCheck::CLI do
let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', nil, 1, false) }
subject(:runner) { described_class.new(options) }
describe '#update_settings' do
it 'updates the interval when changed in a valid response and logs the change' do
fake_response = double
expect(fake_response).to receive(:valid?).and_return(true)
expect(fake_response).to receive(:check_interval).and_return(42)
expect(runner.logger).to receive(:info)
runner.update_settings(fake_response)
expect(options.interval).to eq(42)
end
end
end
require 'spec_helper'
describe Gitlab::StorageCheck::GitlabCaller do
let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', nil, nil, false) }
subject(:gitlab_caller) { described_class.new(options) }
describe '#call!' do
context 'when a socket is given' do
it 'calls a socket' do
fake_connection = double
expect(fake_connection).to receive(:post)
expect(Excon).to receive(:new).with('unix://tmp/socket.sock', socket: "tmp/socket.sock") { fake_connection }
gitlab_caller.call!
end
end
context 'when a host is given' do
let(:options) { Gitlab::StorageCheck::Options.new('http://localhost:8080', nil, nil, false) }
it 'it calls a http response' do
fake_connection = double
expect(Excon).to receive(:new).with('http://localhost:8080', socket: nil) { fake_connection }
expect(fake_connection).to receive(:post)
gitlab_caller.call!
end
end
end
describe '#headers' do
it 'Adds the JSON header' do
headers = gitlab_caller.headers
expect(headers['Content-Type']).to eq('application/json')
end
context 'when a token was provided' do
let(:options) { Gitlab::StorageCheck::Options.new('unix://tmp/socket.sock', 'atoken', nil, false) }
it 'adds it to the headers' do
expect(gitlab_caller.headers['TOKEN']).to eq('atoken')
end
end
end
end
require 'spec_helper'
describe Gitlab::StorageCheck::OptionParser do
describe '.parse!' do
it 'assigns all options' do
args = %w(--target unix://tmp/hello/world.sock --token thetoken --interval 42)
options = described_class.parse!(args)
expect(options.token).to eq('thetoken')
expect(options.interval).to eq(42)
expect(options.target).to eq('unix://tmp/hello/world.sock')
end
it 'requires the interval to be a number' do
args = %w(--target unix://tmp/hello/world.sock --interval fortytwo)
expect { described_class.parse!(args) }.to raise_error(OptionParser::InvalidArgument)
end
it 'raises an error if the scheme is not included' do
args = %w(--target tmp/hello/world.sock)
expect { described_class.parse!(args) }.to raise_error(OptionParser::InvalidArgument)
end
it 'raises an error if both socket and host are missing' do
expect { described_class.parse!([]) }.to raise_error(OptionParser::InvalidArgument)
end
end
end
require 'spec_helper'
describe Gitlab::StorageCheck::Response do
let(:fake_json) do
{
check_interval: 42,
results: [
{ storage: 'working', success: true },
{ storage: 'skipped', success: nil },
{ storage: 'failing', success: false }
]
}.to_json
end
let(:fake_http_response) do
fake_response = instance_double("Excon::Response - Status check")
allow(fake_response).to receive(:status).and_return(200)
allow(fake_response).to receive(:body).and_return(fake_json)
allow(fake_response).to receive(:headers).and_return('Content-Type' => 'application/json')
fake_response
end
let(:response) { described_class.new(fake_http_response) }
describe '#valid?' do
it 'is valid for a success response with parseable JSON' do
expect(response).to be_valid
end
end
describe '#check_interval' do
it 'returns the result from the JSON' do
expect(response.check_interval).to eq(42)
end
end
describe '#responsive_shards' do
it 'contains the names of working shards' do
expect(response.responsive_shards).to contain_exactly('working')
end
end
describe '#skipped_shards' do
it 'contains the names of skipped shards' do
expect(response.skipped_shards).to contain_exactly('skipped')
end
end
describe '#failing_shards' do
it 'contains the name of failing shards' do
expect(response.failing_shards).to contain_exactly('failing')
end
end
end
...@@ -115,9 +115,8 @@ describe ApplicationSetting do ...@@ -115,9 +115,8 @@ describe ApplicationSetting do
end end
context 'circuitbreaker settings' do context 'circuitbreaker settings' do
[:circuitbreaker_backoff_threshold, [:circuitbreaker_failure_count_threshold,
:circuitbreaker_failure_count_threshold, :circuitbreaker_check_interval,
:circuitbreaker_failure_wait_time,
:circuitbreaker_failure_reset_time, :circuitbreaker_failure_reset_time,
:circuitbreaker_storage_timeout].each do |field| :circuitbreaker_storage_timeout].each do |field|
it "Validates #{field} as number" do it "Validates #{field} as number" do
...@@ -126,16 +125,6 @@ describe ApplicationSetting do ...@@ -126,16 +125,6 @@ describe ApplicationSetting do
.is_greater_than_or_equal_to(0) .is_greater_than_or_equal_to(0)
end end
end end
it 'requires the `backoff_threshold` to be lower than the `failure_count_threshold`' do
setting.circuitbreaker_failure_count_threshold = 10
setting.circuitbreaker_backoff_threshold = 15
failure_message = "The circuitbreaker backoff threshold should be lower "\
"than the failure count threshold"
expect(setting).not_to be_valid
expect(setting.errors[:circuitbreaker_backoff_threshold]).to include(failure_message)
end
end end
context 'repository storages' do context 'repository storages' do
......
...@@ -29,7 +29,9 @@ describe Repository do ...@@ -29,7 +29,9 @@ describe Repository do
def expect_to_raise_storage_error def expect_to_raise_storage_error
expect { yield }.to raise_error do |exception| expect { yield }.to raise_error do |exception|
storage_exceptions = [Gitlab::Git::Storage::Inaccessible, Gitlab::Git::CommandError, GRPC::Unavailable] storage_exceptions = [Gitlab::Git::Storage::Inaccessible, Gitlab::Git::CommandError, GRPC::Unavailable]
expect(exception.class).to be_in(storage_exceptions) known_exception = storage_exceptions.select { |e| exception.is_a?(e) }
expect(known_exception).not_to be_nil
end end
end end
...@@ -634,9 +636,7 @@ describe Repository do ...@@ -634,9 +636,7 @@ describe Repository do
end end
describe '#fetch_ref' do describe '#fetch_ref' do
# Setting the var here, sidesteps the stub that makes gitaly raise an error let(:broken_repository) { create(:project, :broken_storage).repository }
# before the actual test call
set(:broken_repository) { create(:project, :broken_storage).repository }
describe 'when storage is broken', :broken_storage do describe 'when storage is broken', :broken_storage do
it 'should raise a storage error' do it 'should raise a storage error' do
......
...@@ -47,7 +47,7 @@ describe API::CircuitBreakers do ...@@ -47,7 +47,7 @@ describe API::CircuitBreakers do
describe 'DELETE circuit_breakers/repository_storage' do describe 'DELETE circuit_breakers/repository_storage' do
it 'clears all circuit_breakers' do it 'clears all circuit_breakers' do
expect(Gitlab::Git::Storage::CircuitBreaker).to receive(:reset_all!) expect(Gitlab::Git::Storage::FailureInfo).to receive(:reset_all!)
delete api('/circuit_breakers/repository_storage', admin) delete api('/circuit_breakers/repository_storage', admin)
......
...@@ -54,7 +54,7 @@ describe API::Settings, 'Settings' do ...@@ -54,7 +54,7 @@ describe API::Settings, 'Settings' do
dsa_key_restriction: 2048, dsa_key_restriction: 2048,
ecdsa_key_restriction: 384, ecdsa_key_restriction: 384,
ed25519_key_restriction: 256, ed25519_key_restriction: 256,
circuitbreaker_failure_wait_time: 2 circuitbreaker_check_interval: 2
expect(response).to have_gitlab_http_status(200) expect(response).to have_gitlab_http_status(200)
expect(json_response['default_projects_limit']).to eq(3) expect(json_response['default_projects_limit']).to eq(3)
...@@ -75,7 +75,7 @@ describe API::Settings, 'Settings' do ...@@ -75,7 +75,7 @@ describe API::Settings, 'Settings' do
expect(json_response['dsa_key_restriction']).to eq(2048) expect(json_response['dsa_key_restriction']).to eq(2048)
expect(json_response['ecdsa_key_restriction']).to eq(384) expect(json_response['ecdsa_key_restriction']).to eq(384)
expect(json_response['ed25519_key_restriction']).to eq(256) expect(json_response['ed25519_key_restriction']).to eq(256)
expect(json_response['circuitbreaker_failure_wait_time']).to eq(2) expect(json_response['circuitbreaker_check_interval']).to eq(2)
end end
end end
......
...@@ -121,18 +121,6 @@ RSpec.configure do |config| ...@@ -121,18 +121,6 @@ RSpec.configure do |config|
reset_delivered_emails! reset_delivered_emails!
end end
# Stub the `ForkedStorageCheck.storage_available?` method unless
# `:broken_storage` metadata is defined
#
# This check can be slow and is unnecessary in a test environment where we
# know the storage is available, because we create it at runtime
config.before(:example) do |example|
unless example.metadata[:broken_storage]
allow(Gitlab::Git::Storage::ForkedStorageCheck)
.to receive(:storage_available?).and_return(true)
end
end
config.around(:each, :use_clean_rails_memory_store_caching) do |example| config.around(:each, :use_clean_rails_memory_store_caching) do |example|
caching_store = Rails.cache caching_store = Rails.cache
Rails.cache = ActiveSupport::Cache::MemoryStore.new Rails.cache = ActiveSupport::Cache::MemoryStore.new
......
...@@ -12,6 +12,25 @@ RSpec.configure do |config| ...@@ -12,6 +12,25 @@ RSpec.configure do |config|
raise GRPC::Unavailable.new('Gitaly broken in this spec') raise GRPC::Unavailable.new('Gitaly broken in this spec')
end end
Gitlab::Git::Storage::CircuitBreaker.reset_all! # Track the maximum number of failures
first_failure = Time.parse("2017-11-14 17:52:30")
last_failure = Time.parse("2017-11-14 18:54:37")
failure_count = Gitlab::CurrentSettings
.current_application_settings
.circuitbreaker_failure_count_threshold + 1
cache_key = "#{Gitlab::Git::Storage::REDIS_KEY_PREFIX}broken:#{Gitlab::Environment.hostname}"
Gitlab::Git::Storage.redis.with do |redis|
redis.pipelined do
redis.zadd(Gitlab::Git::Storage::REDIS_KNOWN_KEYS, 0, cache_key)
redis.hset(cache_key, :first_failure, first_failure.to_i)
redis.hset(cache_key, :last_failure, last_failure.to_i)
redis.hset(cache_key, :failure_count, failure_count.to_i)
end
end
end
config.after(:each, :broken_storage) do
Gitlab::Git::Storage.redis.with(&:flushall)
end end
end end
...@@ -43,6 +43,8 @@ module StubConfiguration ...@@ -43,6 +43,8 @@ module StubConfiguration
end end
def stub_storage_settings(messages) def stub_storage_settings(messages)
messages.deep_stringify_keys!
# Default storage is always required # Default storage is always required
messages['default'] ||= Gitlab.config.repositories.storages.default messages['default'] ||= Gitlab.config.repositories.storages.default
messages.each do |storage_name, storage_settings| messages.each do |storage_name, storage_settings|
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment