Commit 8ac9038e authored by Nick Thomas's avatar Nick Thomas

Merge branch 'zj-gitaly-read-write-check' into 'master'

Gitaly metrics check for read/writeability

Closes gitaly#1218

See merge request gitlab-org/gitlab-ce!20022
parents a15f2158 65840591
......@@ -418,7 +418,7 @@ group :ed25519 do
end
# Gitaly GRPC client
gem 'gitaly-proto', '~> 0.102.0', require: 'gitaly'
gem 'gitaly-proto', '~> 0.103.0', require: 'gitaly'
gem 'grpc', '~> 1.11.0'
# Locked until https://github.com/google/protobuf/issues/4210 is closed
......
......@@ -282,7 +282,7 @@ GEM
gettext_i18n_rails (>= 0.7.1)
po_to_json (>= 1.0.0)
rails (>= 3.2.0)
gitaly-proto (0.102.0)
gitaly-proto (0.103.0)
google-protobuf (~> 3.1)
grpc (~> 1.10)
github-linguist (5.3.3)
......@@ -1041,7 +1041,7 @@ DEPENDENCIES
gettext (~> 3.2.2)
gettext_i18n_rails (~> 1.8.0)
gettext_i18n_rails_js (~> 1.3)
gitaly-proto (~> 0.102.0)
gitaly-proto (~> 0.103.0)
github-linguist (~> 5.3.3)
gitlab-flowdock-git-hook (~> 1.0.1)
gitlab-gollum-lib (~> 4.2)
......
......@@ -285,7 +285,7 @@ GEM
gettext_i18n_rails (>= 0.7.1)
po_to_json (>= 1.0.0)
rails (>= 3.2.0)
gitaly-proto (0.102.0)
gitaly-proto (0.103.0)
google-protobuf (~> 3.1)
grpc (~> 1.10)
github-linguist (5.3.3)
......@@ -1051,7 +1051,7 @@ DEPENDENCIES
gettext (~> 3.2.2)
gettext_i18n_rails (~> 1.8.0)
gettext_i18n_rails_js (~> 1.3)
gitaly-proto (~> 0.102.0)
gitaly-proto (~> 0.103.0)
github-linguist (~> 5.3.3)
gitlab-flowdock-git-hook (~> 1.0.1)
gitlab-gollum-lib (~> 4.2)
......
......@@ -8,7 +8,6 @@ class HealthController < ActionController::Base
Gitlab::HealthChecks::Redis::CacheCheck,
Gitlab::HealthChecks::Redis::QueuesCheck,
Gitlab::HealthChecks::Redis::SharedStateCheck,
Gitlab::HealthChecks::FsShardsCheck,
Gitlab::HealthChecks::GitalyCheck
].freeze
......
......@@ -6,7 +6,8 @@ class MetricsService
Gitlab::HealthChecks::Redis::RedisCheck,
Gitlab::HealthChecks::Redis::CacheCheck,
Gitlab::HealthChecks::Redis::QueuesCheck,
Gitlab::HealthChecks::Redis::SharedStateCheck
Gitlab::HealthChecks::Redis::SharedStateCheck,
Gitlab::HealthChecks::GitalyCheck
].freeze
def prometheus_metrics_text
......
---
title: Gitaly metrics check for read/writeability
merge_request: 20022
author:
type: other
......@@ -22,6 +22,18 @@ module Gitaly
server_version == Gitlab::GitalyClient.expected_server_version
end
def read_writeable?
readable? && writeable?
end
def readable?
storage_status&.readable
end
def writeable?
storage_status&.writeable
end
def address
Gitlab::GitalyClient.address(@storage)
rescue RuntimeError => e
......@@ -30,13 +42,17 @@ module Gitaly
private
def storage_status
@storage_status ||= info.storage_statuses.find { |s| s.storage_name == storage }
end
def info
@info ||=
begin
Gitlab::GitalyClient::ServerService.new(@storage).info
rescue GRPC::Unavailable, GRPC::GRPC::DeadlineExceeded
# This will show the server as being out of date
Gitaly::ServerInfoResponse.new(git_version: '', server_version: '')
Gitaly::ServerInfoResponse.new(git_version: '', server_version: '', storage_statuses: [])
end
end
end
......
module Gitlab
module HealthChecks
# Gitaly migration: https://gitlab.com/gitlab-org/gitaly/issues/1218
class FsShardsCheck
extend BaseAbstractCheck
RANDOM_STRING = SecureRandom.hex(1000).freeze
COMMAND_TIMEOUT = '1'.freeze
TIMEOUT_EXECUTABLE = 'timeout'.freeze
class << self
def readiness
repository_storages.map do |storage_name|
begin
if !storage_circuitbreaker_test(storage_name)
HealthChecks::Result.new(false, 'circuitbreaker tripped', shard: storage_name)
elsif !storage_stat_test(storage_name)
HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
else
with_temp_file(storage_name) do |tmp_file_path|
if !storage_write_test(tmp_file_path)
HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
elsif !storage_read_test(tmp_file_path)
HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
else
HealthChecks::Result.new(true, nil, shard: storage_name)
end
end
end
rescue RuntimeError => ex
message = "unexpected error #{ex} when checking storage #{storage_name}"
Rails.logger.error(message)
HealthChecks::Result.new(false, message, shard: storage_name)
end
end
end
def metrics
repository_storages.flat_map do |storage_name|
[
storage_stat_metrics(storage_name),
storage_write_metrics(storage_name),
storage_read_metrics(storage_name),
storage_circuitbreaker_metrics(storage_name)
].flatten
end
end
private
def operation_metrics(ok_metric, latency_metric, **labels)
result, elapsed = yield
[
metric(latency_metric, elapsed, **labels),
metric(ok_metric, result ? 1 : 0, **labels)
]
rescue RuntimeError => ex
Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
[metric(ok_metric, 0, **labels)]
end
def repository_storages
storages_paths.keys
end
def storages_paths
Gitlab.config.repositories.storages
end
def exec_with_timeout(cmd_args, *args, &block)
Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT].concat(cmd_args), *args, &block)
end
def with_temp_file(storage_name)
temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_path(storage_name)) { |path| path }
yield temp_file_path
ensure
delete_test_file(temp_file_path)
end
def storage_path(storage_name)
Gitlab::GitalyClient::StorageSettings.allow_disk_access do
storages_paths[storage_name]&.legacy_disk_path
end
end
# All below test methods use shell commands to perform actions on storage volumes.
# In case a storage volume have connectivity problems causing pure Ruby IO operation to wait indefinitely,
# we can rely on shell commands to be terminated once `timeout` kills them.
#
# However we also fallback to pure Ruby file operations in case a specific shell command is missing
# so we are still able to perform healthchecks and gather metrics from such system.
def delete_test_file(tmp_path)
_, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
status.zero?
rescue Errno::ENOENT
File.delete(tmp_path) rescue Errno::ENOENT
end
def storage_stat_test(storage_name)
stat_path = File.join(storage_path(storage_name), '.')
begin
_, status = exec_with_timeout(%W{ stat #{stat_path} })
status.zero?
rescue Errno::ENOENT
File.exist?(stat_path) && File::Stat.new(stat_path).readable?
end
end
def storage_write_test(tmp_path)
_, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
stdin.write(RANDOM_STRING)
end
status.zero?
rescue Errno::ENOENT
written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT
written_bytes == RANDOM_STRING.length
end
def storage_read_test(tmp_path)
_, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
stdin.write(RANDOM_STRING)
end
status.zero?
rescue Errno::ENOENT
file_contents = File.read(tmp_path) rescue Errno::ENOENT
file_contents == RANDOM_STRING
end
def storage_circuitbreaker_test(storage_name)
Gitlab::Git::Storage::CircuitBreaker.build(storage_name).perform { "OK" }
rescue Gitlab::Git::Storage::Inaccessible
nil
end
def storage_stat_metrics(storage_name)
operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
with_timing { storage_stat_test(storage_name) }
end
end
def storage_write_metrics(storage_name)
operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do
with_temp_file(storage_name) do |tmp_file_path|
with_timing { storage_write_test(tmp_file_path) }
end
end
end
def storage_read_metrics(storage_name)
operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do
with_temp_file(storage_name) do |tmp_file_path|
storage_write_test(tmp_file_path) # writes data used by read test
with_timing { storage_read_test(tmp_file_path) }
end
end
end
def storage_circuitbreaker_metrics(storage_name)
operation_metrics(:filesystem_circuitbreaker,
:filesystem_circuitbreaker_latency_seconds,
shard: storage_name) do
with_timing { storage_circuitbreaker_test(storage_name) }
end
end
end
end
end
end
......@@ -13,14 +13,14 @@ module Gitlab
end
def metrics
repository_storages.flat_map do |storage_name|
result, elapsed = with_timing { check(storage_name) }
labels = { shard: storage_name }
Gitaly::Server.all.flat_map do |server|
result, elapsed = with_timing { server.read_writeable? }
labels = { shard: server.storage }
[
metric("#{metric_prefix}_success", successful?(result) ? 1 : 0, **labels),
metric("#{metric_prefix}_success", result ? 1 : 0, **labels),
metric("#{metric_prefix}_latency_seconds", elapsed, **labels)
].flatten
]
end
end
......@@ -36,10 +36,6 @@ module Gitlab
METRIC_PREFIX
end
def successful?(result)
result[:success]
end
def repository_storages
storages.keys
end
......
......@@ -69,8 +69,7 @@ describe HealthController do
expect(json_response['cache_check']['status']).to eq('ok')
expect(json_response['queues_check']['status']).to eq('ok')
expect(json_response['shared_state_check']['status']).to eq('ok')
expect(json_response['fs_shards_check']['status']).to eq('ok')
expect(json_response['fs_shards_check']['labels']['shard']).to eq('default')
expect(json_response['gitaly_check']['status']).to eq('ok')
end
end
......@@ -122,7 +121,6 @@ describe HealthController do
expect(json_response['cache_check']['status']).to eq('ok')
expect(json_response['queues_check']['status']).to eq('ok')
expect(json_response['shared_state_check']['status']).to eq('ok')
expect(json_response['fs_shards_check']['status']).to eq('ok')
end
end
......
......@@ -59,6 +59,13 @@ describe MetricsController do
expect(response.body).to match(/^redis_shared_state_ping_latency_seconds [0-9\.]+$/)
end
it 'returns Gitaly metrics' do
get :index
expect(response.body).to match(/^gitaly_health_check_success{shard="default"} 1$/)
expect(response.body).to match(/^gitaly_health_check_latency_seconds{shard="default"} [0-9\.]+$/)
end
context 'prometheus metrics are disabled' do
before do
allow(Gitlab::Metrics).to receive(:prometheus_metrics_enabled?).and_return(false)
......
require 'spec_helper'
describe Gitaly::Server do
let(:server) { described_class.new('default') }
describe '.all' do
let(:storages) { Gitlab.config.repositories.storages }
......@@ -17,6 +19,38 @@ describe Gitaly::Server do
it { is_expected.to respond_to(:up_to_date?) }
it { is_expected.to respond_to(:address) }
describe 'readable?' do
context 'when the storage is readable' do
it 'returns true' do
expect(server).to be_readable
end
end
context 'when the storage is not readable' do
let(:server) { described_class.new('broken') }
it 'returns false' do
expect(server).not_to be_readable
end
end
end
describe 'writeable?' do
context 'when the storage is writeable' do
it 'returns true' do
expect(server).to be_writeable
end
end
context 'when the storage is not writeable' do
let(:server) { described_class.new('broken') }
it 'returns false' do
expect(server).not_to be_writeable
end
end
end
describe 'request memoization' do
context 'when requesting multiple properties', :request_store do
it 'uses memoization for the info request' do
......
require 'spec_helper'
describe Gitlab::HealthChecks::FsShardsCheck do
def command_exists?(command)
_, status = Gitlab::Popen.popen(%W{ #{command} 1 echo })
status.zero?
rescue Errno::ENOENT
false
end
def timeout_command
@timeout_command ||=
if command_exists?('timeout')
'timeout'
elsif command_exists?('gtimeout')
'gtimeout'
else
''
end
end
let(:metric_class) { Gitlab::HealthChecks::Metric }
let(:result_class) { Gitlab::HealthChecks::Result }
let(:repository_storages) { ['default'] }
let(:tmp_dir) { Dir.mktmpdir }
let(:storages_paths) do
{
default: Gitlab::GitalyClient::StorageSettings.new('path' => tmp_dir)
}.with_indifferent_access
end
before do
allow(described_class).to receive(:repository_storages) { repository_storages }
allow(described_class).to receive(:storages_paths) { storages_paths }
stub_const('Gitlab::HealthChecks::FsShardsCheck::TIMEOUT_EXECUTABLE', timeout_command)
end
after do
FileUtils.remove_entry_secure(tmp_dir) if Dir.exist?(tmp_dir)
end
shared_examples 'filesystem checks' do
describe '#readiness' do
subject { described_class.readiness }
context 'storage has a tripped circuitbreaker', :broken_storage do
let(:repository_storages) { ['broken'] }
let(:storages_paths) do
Gitlab.config.repositories.storages
end
it { is_expected.to include(result_class.new(false, 'circuitbreaker tripped', shard: 'broken')) }
end
context 'storage points to not existing folder' do
let(:storages_paths) do
{
default: Gitlab::GitalyClient::StorageSettings.new('path' => 'tmp/this/path/doesnt/exist')
}.with_indifferent_access
end
before do
allow(described_class).to receive(:storage_circuitbreaker_test) { true }
end
it { is_expected.to include(result_class.new(false, 'cannot stat storage', shard: 'default')) }
end
context 'storage points to directory that has both read and write rights' do
before do
FileUtils.chmod_R(0755, tmp_dir)
end
it { is_expected.to include(result_class.new(true, nil, shard: 'default')) }
it 'cleans up files used for testing' do
expect(described_class).to receive(:storage_write_test).with(any_args).and_call_original
expect { subject }.not_to change(Dir.entries(tmp_dir), :count)
end
context 'read test fails' do
before do
allow(described_class).to receive(:storage_read_test).with(any_args).and_return(false)
end
it { is_expected.to include(result_class.new(false, 'cannot read from storage', shard: 'default')) }
end
context 'write test fails' do
before do
allow(described_class).to receive(:storage_write_test).with(any_args).and_return(false)
end
it { is_expected.to include(result_class.new(false, 'cannot write to storage', shard: 'default')) }
end
end
end
describe '#metrics' do
context 'storage points to not existing folder' do
let(:storages_paths) do
{
default: Gitlab::GitalyClient::StorageSettings.new('path' => 'tmp/this/path/doesnt/exist')
}.with_indifferent_access
end
it 'provides metrics' do
metrics = described_class.metrics
expect(metrics).to all(have_attributes(labels: { shard: 'default' }))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_circuitbreaker_latency_seconds, value: be >= 0))
end
end
context 'storage points to directory that has both read and write rights' do
before do
FileUtils.chmod_R(0755, tmp_dir)
end
it 'provides metrics' do
metrics = described_class.metrics
expect(metrics).to all(have_attributes(labels: { shard: 'default' }))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 1))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 1))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 1))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_circuitbreaker_latency_seconds, value: be >= 0))
end
it 'cleans up files used for metrics' do
expect { described_class.metrics }.not_to change(Dir.entries(tmp_dir), :count)
end
end
end
end
context 'when timeout kills fs checks' do
before do
stub_const('Gitlab::HealthChecks::FsShardsCheck::COMMAND_TIMEOUT', '1')
allow(described_class).to receive(:exec_with_timeout).and_wrap_original { |m| m.call(%w(sleep 60)) }
FileUtils.chmod_R(0755, tmp_dir)
end
describe '#readiness' do
subject { described_class.readiness }
it { is_expected.to include(result_class.new(false, 'cannot stat storage', shard: 'default')) }
end
describe '#metrics' do
it 'provides metrics' do
metrics = described_class.metrics
expect(metrics).to all(have_attributes(labels: { shard: 'default' }))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
end
end
end
context 'when popen always finds required binaries' do
before do
allow(described_class).to receive(:exec_with_timeout).and_wrap_original do |method, *args, &block|
begin
method.call(*args, &block)
rescue RuntimeError, Errno::ENOENT
raise 'expected not to happen'
end
end
stub_const('Gitlab::HealthChecks::FsShardsCheck::COMMAND_TIMEOUT', '10')
end
it_behaves_like 'filesystem checks'
end
context 'when popen never finds required binaries' do
before do
allow(Gitlab::Popen).to receive(:popen).and_raise(Errno::ENOENT)
end
it_behaves_like 'filesystem checks'
end
end
......@@ -30,13 +30,14 @@ describe Gitlab::HealthChecks::GitalyCheck do
describe '#metrics' do
subject { described_class.metrics }
let(:server) { double(storage: 'default', read_writeable?: up) }
before do
expect(Gitlab::GitalyClient::HealthCheckService).to receive(:new).and_return(gitaly_check)
allow(Gitaly::Server).to receive(:new).and_return(server)
end
context 'Gitaly server is up' do
let(:gitaly_check) { double(check: { success: true }) }
let(:up) { true }
it 'provides metrics' do
expect(subject).to all(have_attributes(labels: { shard: 'default' }))
......@@ -46,7 +47,7 @@ describe Gitlab::HealthChecks::GitalyCheck do
end
context 'Gitaly server is down' do
let(:gitaly_check) { double(check: { success: false, message: 'Connection refused' }) }
let(:up) { false }
it 'provides metrics' do
expect(subject).to include(an_object_having_attributes(name: 'gitaly_health_check_success', value: 0))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment