Merge branch '232786-respect-stop-query-failures' into 'master'

Skip subsequent topology Prometheus queries if timeout occur See merge request gitlab-org/gitlab!38293

Merge branch '232786-respect-stop-query-failures' into 'master'
Skip subsequent topology Prometheus queries if timeout occur See merge request gitlab-org/gitlab!38293
123f70cc · Peter Leitzen · 5eae1765 · 32958f14 · 123f70cc · 123f70cc
Commit 123f70cc authored Aug 13, 2020 by Peter Leitzen
3 changed files
--- a/changelogs/unreleased/232786-respect-stop-query-failures.yml
+++ b/changelogs/unreleased/232786-respect-stop-query-failures.yml
+---
+title: Skip subsequent topology Prometheus queries if timeout occur
+merge_request: 38293
+author:
+type: performance
--- a/lib/gitlab/usage_data/topology.rb
+++ b/lib/gitlab/usage_data/topology.rb
@@ -17,6 +17,9 @@ module Gitlab
        'registry' => 'registry'
      }.freeze
+      # If these errors occur, all subsequent queries are likely to fail for the same error
+      TIMEOUT_ERRORS = [Errno::ETIMEDOUT, Net::OpenTimeout, Net::ReadTimeout].freeze
      CollectionFailure = Struct.new(:query, :error) do
        def to_h
          { query => error }
@@ -158,6 +161,11 @@ module Gitlab
      end
      def query_safely(query, query_name, fallback:)
+        if timeout_error_exists?
+          @failures << CollectionFailure.new(query_name, 'timeout_cancellation')
+          return fallback
+        end
        result = yield query
        return result if result.present?
@@ -169,6 +177,14 @@ module Gitlab
        fallback
      end
+      def timeout_error_exists?
+        timeout_error_names = TIMEOUT_ERRORS.map(&:to_s).to_set
+        @failures.any? do |failure|
+          timeout_error_names.include?(failure.error)
+        end
+      end
      def topology_node_services(instance, all_process_counts, all_process_memory, all_server_types)
        # returns all node service data grouped by service name as the key
        instance_service_data =

--- a/spec/lib/gitlab/usage_data/topology_spec.rb
+++ b/spec/lib/gitlab/usage_data/topology_spec.rb
@@ -402,28 +402,61 @@ RSpec.describe Gitlab::UsageData::Topology do
      end
      context 'and an error is raised when querying Prometheus' do
-        it 'returns empty result with failures' do
+        context 'without timeout failures' do
-          expect_prometheus_api_to receive(:query)
+          it 'returns empty result and executes subsequent queries as usual' do
-            .at_least(:once)
+            expect_prometheus_api_to receive(:query)
-            .and_raise(Gitlab::PrometheusClient::ConnectionError)
+              .at_least(:once)
+              .and_raise(Gitlab::PrometheusClient::ConnectionError)
+            expect(subject[:topology]).to eq({
+              duration_s: 0,
+              failures: [
+                { 'app_requests' => 'Gitlab::PrometheusClient::ConnectionError' },
+                { 'node_memory' => 'Gitlab::PrometheusClient::ConnectionError' },
+                { 'node_memory_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
+                { 'node_cpus' => 'Gitlab::PrometheusClient::ConnectionError' },
+                { 'node_cpu_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
+                { 'node_uname_info' => 'Gitlab::PrometheusClient::ConnectionError' },
+                { 'service_rss' => 'Gitlab::PrometheusClient::ConnectionError' },
+                { 'service_uss' => 'Gitlab::PrometheusClient::ConnectionError' },
+                { 'service_pss' => 'Gitlab::PrometheusClient::ConnectionError' },
+                { 'service_process_count' => 'Gitlab::PrometheusClient::ConnectionError' },
+                { 'service_workers' => 'Gitlab::PrometheusClient::ConnectionError' }
+              ],
+              nodes: []
+            })
+          end
+        end
-          expect(subject[:topology]).to eq({
+        context 'with timeout failures' do
-            duration_s: 0,
+          where(:exception) do
-            failures: [
+            described_class::TIMEOUT_ERRORS
-              { 'app_requests' => 'Gitlab::PrometheusClient::ConnectionError' },
+          end
-              { 'node_memory' => 'Gitlab::PrometheusClient::ConnectionError' },
-              { 'node_memory_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
+          with_them do
-              { 'node_cpus' => 'Gitlab::PrometheusClient::ConnectionError' },
+            it 'returns empty result and cancelled subsequent queries' do
-              { 'node_cpu_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
+              expect_prometheus_api_to receive(:query)
-              { 'node_uname_info' => 'Gitlab::PrometheusClient::ConnectionError' },
+                .and_raise(exception)
-              { 'service_rss' => 'Gitlab::PrometheusClient::ConnectionError' },
-              { 'service_uss' => 'Gitlab::PrometheusClient::ConnectionError' },
+              expect(subject[:topology]).to eq({
-              { 'service_pss' => 'Gitlab::PrometheusClient::ConnectionError' },
+                duration_s: 0,
-              { 'service_process_count' => 'Gitlab::PrometheusClient::ConnectionError' },
+                failures: [
-              { 'service_workers' => 'Gitlab::PrometheusClient::ConnectionError' }
+                  { 'app_requests' => exception.to_s },
-            ],
+                  { 'node_memory' => 'timeout_cancellation' },
-            nodes: []
+                  { 'node_memory_utilization' => 'timeout_cancellation' },
-          })
+                  { 'node_cpus' => 'timeout_cancellation' },
+                  { 'node_cpu_utilization' => 'timeout_cancellation' },
+                  { 'node_uname_info' => 'timeout_cancellation' },
+                  { 'service_rss' => 'timeout_cancellation' },
+                  { 'service_uss' => 'timeout_cancellation' },
+                  { 'service_pss' => 'timeout_cancellation' },
+                  { 'service_process_count' => 'timeout_cancellation' },
+                  { 'service_workers' => 'timeout_cancellation' }
+                ],
+                nodes: []
+              })
+            end
+          end
        end
      end
    end