Commit 123f70cc authored by Peter Leitzen's avatar Peter Leitzen

Merge branch '232786-respect-stop-query-failures' into 'master'

Skip subsequent topology Prometheus queries if timeout occur

See merge request gitlab-org/gitlab!38293
parents 5eae1765 32958f14
---
title: Skip subsequent topology Prometheus queries if timeout occur
merge_request: 38293
author:
type: performance
...@@ -17,6 +17,9 @@ module Gitlab ...@@ -17,6 +17,9 @@ module Gitlab
'registry' => 'registry' 'registry' => 'registry'
}.freeze }.freeze
# If these errors occur, all subsequent queries are likely to fail for the same error
TIMEOUT_ERRORS = [Errno::ETIMEDOUT, Net::OpenTimeout, Net::ReadTimeout].freeze
CollectionFailure = Struct.new(:query, :error) do CollectionFailure = Struct.new(:query, :error) do
def to_h def to_h
{ query => error } { query => error }
...@@ -158,6 +161,11 @@ module Gitlab ...@@ -158,6 +161,11 @@ module Gitlab
end end
def query_safely(query, query_name, fallback:) def query_safely(query, query_name, fallback:)
if timeout_error_exists?
@failures << CollectionFailure.new(query_name, 'timeout_cancellation')
return fallback
end
result = yield query result = yield query
return result if result.present? return result if result.present?
...@@ -169,6 +177,14 @@ module Gitlab ...@@ -169,6 +177,14 @@ module Gitlab
fallback fallback
end end
def timeout_error_exists?
timeout_error_names = TIMEOUT_ERRORS.map(&:to_s).to_set
@failures.any? do |failure|
timeout_error_names.include?(failure.error)
end
end
def topology_node_services(instance, all_process_counts, all_process_memory, all_server_types) def topology_node_services(instance, all_process_counts, all_process_memory, all_server_types)
# returns all node service data grouped by service name as the key # returns all node service data grouped by service name as the key
instance_service_data = instance_service_data =
......
...@@ -402,28 +402,61 @@ RSpec.describe Gitlab::UsageData::Topology do ...@@ -402,28 +402,61 @@ RSpec.describe Gitlab::UsageData::Topology do
end end
context 'and an error is raised when querying Prometheus' do context 'and an error is raised when querying Prometheus' do
it 'returns empty result with failures' do context 'without timeout failures' do
expect_prometheus_api_to receive(:query) it 'returns empty result and executes subsequent queries as usual' do
.at_least(:once) expect_prometheus_api_to receive(:query)
.and_raise(Gitlab::PrometheusClient::ConnectionError) .at_least(:once)
.and_raise(Gitlab::PrometheusClient::ConnectionError)
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_cpus' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_cpu_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_uname_info' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_rss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_uss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_pss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_process_count' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_workers' => 'Gitlab::PrometheusClient::ConnectionError' }
],
nodes: []
})
end
end
expect(subject[:topology]).to eq({ context 'with timeout failures' do
duration_s: 0, where(:exception) do
failures: [ described_class::TIMEOUT_ERRORS
{ 'app_requests' => 'Gitlab::PrometheusClient::ConnectionError' }, end
{ 'node_memory' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory_utilization' => 'Gitlab::PrometheusClient::ConnectionError' }, with_them do
{ 'node_cpus' => 'Gitlab::PrometheusClient::ConnectionError' }, it 'returns empty result and cancelled subsequent queries' do
{ 'node_cpu_utilization' => 'Gitlab::PrometheusClient::ConnectionError' }, expect_prometheus_api_to receive(:query)
{ 'node_uname_info' => 'Gitlab::PrometheusClient::ConnectionError' }, .and_raise(exception)
{ 'service_rss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_uss' => 'Gitlab::PrometheusClient::ConnectionError' }, expect(subject[:topology]).to eq({
{ 'service_pss' => 'Gitlab::PrometheusClient::ConnectionError' }, duration_s: 0,
{ 'service_process_count' => 'Gitlab::PrometheusClient::ConnectionError' }, failures: [
{ 'service_workers' => 'Gitlab::PrometheusClient::ConnectionError' } { 'app_requests' => exception.to_s },
], { 'node_memory' => 'timeout_cancellation' },
nodes: [] { 'node_memory_utilization' => 'timeout_cancellation' },
}) { 'node_cpus' => 'timeout_cancellation' },
{ 'node_cpu_utilization' => 'timeout_cancellation' },
{ 'node_uname_info' => 'timeout_cancellation' },
{ 'service_rss' => 'timeout_cancellation' },
{ 'service_uss' => 'timeout_cancellation' },
{ 'service_pss' => 'timeout_cancellation' },
{ 'service_process_count' => 'timeout_cancellation' },
{ 'service_workers' => 'timeout_cancellation' }
],
nodes: []
})
end
end
end end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment