Commit 123f70cc authored by Peter Leitzen

Merge branch '232786-respect-stop-query-failures' into 'master'

Skip subsequent topology Prometheus queries if a timeout occurs

See merge request gitlab-org/gitlab!38293
parents 5eae1765 32958f14
---
title: Skip subsequent topology Prometheus queries if a timeout occurs
merge_request: 38293
author:
type: performance
......@@ -17,6 +17,9 @@ module Gitlab
'registry' => 'registry'
}.freeze
# If any of these errors occurs, all subsequent queries are likely to fail with the same error
TIMEOUT_ERRORS = [Errno::ETIMEDOUT, Net::OpenTimeout, Net::ReadTimeout].freeze
CollectionFailure = Struct.new(:query, :error) do
def to_h
{ query => error }
......@@ -158,6 +161,11 @@ module Gitlab
end
def query_safely(query, query_name, fallback:)
if timeout_error_exists?
@failures << CollectionFailure.new(query_name, 'timeout_cancellation')
return fallback
end
result = yield query
return result if result.present?
......@@ -169,6 +177,14 @@ module Gitlab
fallback
end
# Tells whether a timeout has already been recorded among the collected failures.
#
# A failure counts as a timeout when its `error` value equals the name of one
# of the classes listed in TIMEOUT_ERRORS.
#
# @return [Boolean] true if at least one entry in @failures matches
def timeout_error_exists?
  known_timeout_names = Set.new(TIMEOUT_ERRORS, &:to_s)

  @failures.any? { |recorded| known_timeout_names.member?(recorded.error) }
end
def topology_node_services(instance, all_process_counts, all_process_memory, all_server_types)
# returns all node service data grouped by service name as the key
instance_service_data =
......
......@@ -402,7 +402,8 @@ RSpec.describe Gitlab::UsageData::Topology do
end
context 'and an error is raised when querying Prometheus' do
it 'returns empty result with failures' do
context 'without timeout failures' do
it 'returns empty result and executes subsequent queries as usual' do
expect_prometheus_api_to receive(:query)
.at_least(:once)
.and_raise(Gitlab::PrometheusClient::ConnectionError)
......@@ -426,6 +427,38 @@ RSpec.describe Gitlab::UsageData::Topology do
})
end
end
# Verifies the short-circuit behaviour after a timeout: once one query fails
# with a timeout error, the remaining queries are reported as cancelled.
context 'with timeout failures' do
# Parameterized over every error class listed in TIMEOUT_ERRORS.
where(:exception) do
described_class::TIMEOUT_ERRORS
end
with_them do
it 'returns empty result and cancelled subsequent queries' do
# Any Prometheus `query` call raises the timeout error under test.
# NOTE(review): expect_prometheus_api_to is a spec helper defined outside this view.
expect_prometheus_api_to receive(:query)
.and_raise(exception)
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
# Only the first query records the raised error (the class name, via to_s).
{ 'app_requests' => exception.to_s },
# Every following query is expected to be marked as cancelled.
{ 'node_memory' => 'timeout_cancellation' },
{ 'node_memory_utilization' => 'timeout_cancellation' },
{ 'node_cpus' => 'timeout_cancellation' },
{ 'node_cpu_utilization' => 'timeout_cancellation' },
{ 'node_uname_info' => 'timeout_cancellation' },
{ 'service_rss' => 'timeout_cancellation' },
{ 'service_uss' => 'timeout_cancellation' },
{ 'service_pss' => 'timeout_cancellation' },
{ 'service_process_count' => 'timeout_cancellation' },
{ 'service_workers' => 'timeout_cancellation' }
],
nodes: []
})
end
end
end
end
end
context 'when embedded Prometheus server is disabled' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment