Commit 842bcfa7 authored by Douwe Maan's avatar Douwe Maan

Merge branch 'pawel/ensure_temp_files_are_deleted_in_fs_metrics-35457' into 'master'

Ensure test files are deleted after fs metrics gathering run

Closes #35457

See merge request !13080
parents 0cf5a9cf 9be17322
---
title: Ensure filesystem metrics test files are deleted
merge_request:
author:
......@@ -6,7 +6,7 @@ Prometheus::Client.configure do |config|
config.initial_mmap_file_size = 4 * 1024
config.multiprocess_files_dir = ENV['prometheus_multiproc_dir']
if Rails.env.development? && Rails.env.test?
if Rails.env.development? || Rails.env.test?
config.multiprocess_files_dir ||= Rails.root.join('tmp/prometheus_multiproc_dir')
end
end
......@@ -27,10 +27,10 @@ module Gitlab
Metric.new(name, value, labels)
end
def with_timing(proc)
def with_timing
start = Time.now
result = proc.call
yield result, Time.now.to_f - start.to_f
result = yield
[result, Time.now.to_f - start.to_f]
end
def catch_timeout(seconds, &block)
......
......@@ -10,47 +10,45 @@ module Gitlab
def readiness
repository_storages.map do |storage_name|
begin
tmp_file_path = tmp_file_path(storage_name)
if !storage_stat_test(storage_name)
HealthChecks::Result.new(false, 'cannot stat storage', shard: storage_name)
elsif !storage_write_test(tmp_file_path)
HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
elsif !storage_read_test(tmp_file_path)
HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
else
HealthChecks::Result.new(true, nil, shard: storage_name)
with_temp_file(storage_name) do |tmp_file_path|
if !storage_write_test(tmp_file_path)
HealthChecks::Result.new(false, 'cannot write to storage', shard: storage_name)
elsif !storage_read_test(tmp_file_path)
HealthChecks::Result.new(false, 'cannot read from storage', shard: storage_name)
else
HealthChecks::Result.new(true, nil, shard: storage_name)
end
end
end
rescue RuntimeError => ex
message = "unexpected error #{ex} when checking storage #{storage_name}"
Rails.logger.error(message)
HealthChecks::Result.new(false, message, shard: storage_name)
ensure
delete_test_file(tmp_file_path)
end
end
end
def metrics
repository_storages.flat_map do |storage_name|
tmp_file_path = tmp_file_path(storage_name)
[
operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, -> { storage_stat_test(storage_name) }, shard: storage_name),
operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, -> { storage_write_test(tmp_file_path) }, shard: storage_name),
operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, -> { storage_read_test(tmp_file_path) }, shard: storage_name)
storage_stat_metrics(storage_name),
storage_write_metrics(storage_name),
storage_read_metrics(storage_name)
].flatten
end
end
private
def operation_metrics(ok_metric, latency_metric, operation, **labels)
with_timing operation do |result, elapsed|
[
metric(latency_metric, elapsed, **labels),
metric(ok_metric, result ? 1 : 0, **labels)
]
end
def operation_metrics(ok_metric, latency_metric, **labels)
result, elapsed = yield
[
metric(latency_metric, elapsed, **labels),
metric(ok_metric, result ? 1 : 0, **labels)
]
rescue RuntimeError => ex
Rails.logger.error("unexpected error #{ex} when checking #{ok_metric}")
[metric(ok_metric, 0, **labels)]
......@@ -68,19 +66,36 @@ module Gitlab
Gitlab::Popen.popen([TIMEOUT_EXECUTABLE, COMMAND_TIMEOUT].concat(cmd_args), *args, &block)
end
def tmp_file_path(storage_name)
Dir::Tmpname.create(%w(fs_shards_check +deleted), path(storage_name)) { |path| path }
def with_temp_file(storage_name)
temp_file_path = Dir::Tmpname.create(%w(fs_shards_check +deleted), storage_path(storage_name)) { |path| path }
yield temp_file_path
ensure
delete_test_file(temp_file_path)
end
def path(storage_name)
def storage_path(storage_name)
storages_paths&.dig(storage_name, 'path')
end
# All below test methods use shell commands to perform actions on storage volumes.
# In case a storage volume have connectivity problems causing pure Ruby IO operation to wait indefinitely,
# we can rely on shell commands to be terminated once `timeout` kills them.
#
# However we also fallback to pure Ruby file operations in case a specific shell command is missing
# so we are still able to perform healthchecks and gather metrics from such system.
def delete_test_file(tmp_path)
_, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
status.zero?
rescue Errno::ENOENT
File.delete(tmp_path) rescue Errno::ENOENT
end
def storage_stat_test(storage_name)
stat_path = File.join(path(storage_name), '.')
stat_path = File.join(storage_path(storage_name), '.')
begin
_, status = exec_with_timeout(%W{ stat #{stat_path} })
status == 0
status.zero?
rescue Errno::ENOENT
File.exist?(stat_path) && File::Stat.new(stat_path).readable?
end
......@@ -90,7 +105,7 @@ module Gitlab
_, status = exec_with_timeout(%W{ tee #{tmp_path} }) do |stdin|
stdin.write(RANDOM_STRING)
end
status == 0
status.zero?
rescue Errno::ENOENT
written_bytes = File.write(tmp_path, RANDOM_STRING) rescue Errno::ENOENT
written_bytes == RANDOM_STRING.length
......@@ -100,17 +115,33 @@ module Gitlab
_, status = exec_with_timeout(%W{ diff #{tmp_path} - }) do |stdin|
stdin.write(RANDOM_STRING)
end
status == 0
status.zero?
rescue Errno::ENOENT
file_contents = File.read(tmp_path) rescue Errno::ENOENT
file_contents == RANDOM_STRING
end
def delete_test_file(tmp_path)
_, status = exec_with_timeout(%W{ rm -f #{tmp_path} })
status == 0
rescue Errno::ENOENT
File.delete(tmp_path) rescue Errno::ENOENT
def storage_stat_metrics(storage_name)
operation_metrics(:filesystem_accessible, :filesystem_access_latency_seconds, shard: storage_name) do
with_timing { storage_stat_test(storage_name) }
end
end
def storage_write_metrics(storage_name)
operation_metrics(:filesystem_writable, :filesystem_write_latency_seconds, shard: storage_name) do
with_temp_file(storage_name) do |tmp_file_path|
with_timing { storage_write_test(tmp_file_path) }
end
end
end
def storage_read_metrics(storage_name)
operation_metrics(:filesystem_readable, :filesystem_read_latency_seconds, shard: storage_name) do
with_temp_file(storage_name) do |tmp_file_path|
storage_write_test(tmp_file_path) # writes data used by read test
with_timing { storage_read_test(tmp_file_path) }
end
end
end
end
end
......
......@@ -15,14 +15,13 @@ module Gitlab
end
def metrics
with_timing method(:check) do |result, elapsed|
Rails.logger.error("#{human_name} check returned unexpected result #{result}") unless is_successful?(result)
[
metric("#{metric_prefix}_timeout", result.is_a?(Timeout::Error) ? 1 : 0),
metric("#{metric_prefix}_success", is_successful?(result) ? 1 : 0),
metric("#{metric_prefix}_latency_seconds", elapsed)
]
end
result, elapsed = with_timing(&method(:check))
Rails.logger.error("#{human_name} check returned unexpected result #{result}") unless is_successful?(result)
[
metric("#{metric_prefix}_timeout", result.is_a?(Timeout::Error) ? 1 : 0),
metric("#{metric_prefix}_success", is_successful?(result) ? 1 : 0),
metric("#{metric_prefix}_latency_seconds", elapsed)
]
end
private
......
......@@ -3,7 +3,7 @@ require 'spec_helper'
describe Gitlab::HealthChecks::FsShardsCheck do
def command_exists?(command)
_, status = Gitlab::Popen.popen(%W{ #{command} 1 echo })
status == 0
status.zero?
rescue Errno::ENOENT
false
end
......@@ -64,9 +64,7 @@ describe Gitlab::HealthChecks::FsShardsCheck do
it 'cleans up files used for testing' do
expect(described_class).to receive(:storage_write_test).with(any_args).and_call_original
subject
expect(Dir.entries(tmp_dir).count).to eq(2)
expect { subject }.not_to change(Dir.entries(tmp_dir), :count)
end
context 'read test fails' do
......@@ -88,8 +86,6 @@ describe Gitlab::HealthChecks::FsShardsCheck do
end
describe '#metrics' do
subject { described_class.metrics }
context 'storage points to not existing folder' do
let(:storages_paths) do
{
......@@ -104,14 +100,15 @@ describe Gitlab::HealthChecks::FsShardsCheck do
end
it 'provides metrics' do
expect(subject).to all(have_attributes(labels: { shard: :default }))
expect(subject).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
metrics = described_class.metrics
expect(metrics).to all(have_attributes(labels: { shard: :default }))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
end
end
......@@ -121,15 +118,19 @@ describe Gitlab::HealthChecks::FsShardsCheck do
end
it 'provides metrics' do
expect(subject).to all(have_attributes(labels: { shard: :default }))
expect(subject).to include(an_object_having_attributes(name: :filesystem_accessible, value: 1))
expect(subject).to include(an_object_having_attributes(name: :filesystem_readable, value: 1))
expect(subject).to include(an_object_having_attributes(name: :filesystem_writable, value: 1))
metrics = described_class.metrics
expect(metrics).to all(have_attributes(labels: { shard: :default }))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 1))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 1))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 1))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
end
expect(subject).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
it 'cleans up files used for metrics' do
expect { described_class.metrics }.not_to change(Dir.entries(tmp_dir), :count)
end
end
end
......@@ -150,18 +151,16 @@ describe Gitlab::HealthChecks::FsShardsCheck do
end
describe '#metrics' do
subject { described_class.metrics }
it 'provides metrics' do
expect(subject).to all(have_attributes(labels: { shard: :default }))
expect(subject).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(subject).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
metrics = described_class.metrics
expect(metrics).to all(have_attributes(labels: { shard: :default }))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_accessible, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_readable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_writable, value: 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_access_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_read_latency_seconds, value: be >= 0))
expect(metrics).to include(an_object_having_attributes(name: :filesystem_write_latency_seconds, value: be >= 0))
end
end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment