Track failures in usage ping payload

This exports a `failures` field in the `topology` section of the usage ping payload, recording which queries failed during topology data collection.

Track failures in usage ping payload
This exports a `failures` field carrying information about failed queries during topology data collection.
4e499a76 · Matthias Käppler · James Lopez · 0d34cf28 · 4e499a76 · 4e499a76
Commit 4e499a76, authored Jul 03, 2020 by Matthias Käppler; committed by James Lopez on Jul 03, 2020
8 changed files
--- a/doc/development/telemetry/usage_ping.md
+++ b/doc/development/telemetry/usage_ping.md
@@ -594,6 +594,7 @@ appear to be associated to any of the services running, since they all appear to
 | `ldap_enabled`                                            |                                      |               |                  |         |                                                                            |
 | `mattermost_enabled`                                      |                                      |               |                  |         |                                                                            |
 | `omniauth_enabled`                                        |                                      |               |                  |         |                                                                            |
+| `prometheus_enabled`                                      |                                      |               |                  |         | Whether the bundled Prometheus is enabled                                  |
 | `prometheus_metrics_enabled`                              |                                      |               |                  |         |                                                                            |
 | `reply_by_email_enabled`                                  |                                      |               |                  |         |                                                                            |
 | `average`                                                 | `avg_cycle_analytics - code`         |               |                  |         |                                                                            |
@@ -671,6 +672,7 @@ appear to be associated to any of the services running, since they all appear to
 | `merge_requests_users`                                    | `usage_activity_by_stage_monthly`    | `create`      |                  |         | Unique count of users who used a merge request                             |
 | `duration_s`                                              | `topology`                           | `enablement`  |                  |         | Time it took to collect topology data                                      |
 | `application_requests_per_hour`                           | `topology`                           | `enablement`  |                  |         | Number of requests to the web application per hour                         |
+| `failures`                                                | `topology`                           | `enablement`  |                  |         | Contains information about failed queries                                  |
 | `nodes`                                                   | `topology`                           | `enablement`  |                  |         | The list of server nodes on which GitLab components are running            |
 | `node_memory_total_bytes`                                 | `topology > nodes`                   | `enablement`  |                  |         | The total available memory of this node                                    |
 | `node_cpus`                                               | `topology > nodes`                   | `enablement`  |                  |         | The number of CPU cores of this node                                       |
@@ -723,6 +725,7 @@ The following is example content of the Usage Ping payload.
  "ldap_enabled": false,
  "mattermost_enabled": false,
  "omniauth_enabled": true,
+  "prometheus_enabled": false,
  "prometheus_metrics_enabled": false,
  "reply_by_email_enabled": "incoming+%{key}@incoming.gitlab.com",
  "signup_enabled": true,
@@ -879,6 +882,7 @@ The following is example content of the Usage Ping payload.
  "topology": {
    "duration_s": 0.013836685999194742,
    "application_requests_per_hour": 4224,
+    "failures": [],
    "nodes": [
      {
        "node_memory_total_bytes": 33269903360,

--- a/lib/gitlab/prometheus_client.rb
+++ b/lib/gitlab/prometheus_client.rb
@@ -5,6 +5,8 @@ module Gitlab
  class PrometheusClient
    include Gitlab::Utils::StrongMemoize
    Error = Class.new(StandardError)
+    ConnectionError = Class.new(Gitlab::PrometheusClient::Error)
+    UnexpectedResponseError = Class.new(Gitlab::PrometheusClient::Error)
    QueryError = Class.new(Gitlab::PrometheusClient::Error)
    HEALTHY_RESPONSE = "Prometheus is Healthy.\n"

@@ -44,7 +46,7 @@ module Gitlab
      path = api_path(type)
      get(path, args)
    rescue Gitlab::HTTP::ResponseError => ex
-      raise PrometheusClient::Error, "Network connection error" unless ex.response && ex.response.try(:code)
+      raise PrometheusClient::ConnectionError, "Network connection error" unless ex.response && ex.response.try(:code)

      handle_querying_api_response(ex.response)
    end
@@ -115,7 +117,7 @@ module Gitlab
      response = get(path, args)
      handle_querying_api_response(response)
    rescue Gitlab::HTTP::ResponseError => ex
-      raise PrometheusClient::Error, "Network connection error" unless ex.response && ex.response.try(:code)
+      raise PrometheusClient::ConnectionError, "Network connection error" unless ex.response && ex.response.try(:code)

      handle_querying_api_response(ex.response)
    end
@@ -137,18 +139,18 @@ module Gitlab
    def get(path, args)
      Gitlab::HTTP.get(path, { query: args }.merge(http_options) )
    rescue SocketError
-      raise PrometheusClient::Error, "Can't connect to #{api_url}"
+      raise PrometheusClient::ConnectionError, "Can't connect to #{api_url}"
    rescue OpenSSL::SSL::SSLError
-      raise PrometheusClient::Error, "#{api_url} contains invalid SSL data"
+      raise PrometheusClient::ConnectionError, "#{api_url} contains invalid SSL data"
    rescue Errno::ECONNREFUSED
-      raise PrometheusClient::Error, 'Connection refused'
+      raise PrometheusClient::ConnectionError, 'Connection refused'
    end

    def handle_management_api_response(response)
      if response.code == 200
        response.body
      else
-        raise PrometheusClient::Error, "#{response.code} - #{response.body}"
+        raise PrometheusClient::UnexpectedResponseError, "#{response.code} - #{response.body}"
      end
    end

@@ -156,7 +158,7 @@ module Gitlab
      response_code = response.try(:code)
      response_body = response.try(:body)

-      raise PrometheusClient::Error, "#{response_code} - #{response_body}" unless response_code
+      raise PrometheusClient::UnexpectedResponseError, "#{response_code} - #{response_body}" unless response_code

      json_data = parse_json(response_body) if [200, 400].include?(response_code)

@@ -166,7 +168,7 @@ module Gitlab
      when 400
        raise PrometheusClient::QueryError, json_data['error'] || 'Bad data received'
      else
-        raise PrometheusClient::Error, "#{response_code} - #{response_body}"
+        raise PrometheusClient::UnexpectedResponseError, "#{response_code} - #{response_body}"
      end
    end

@@ -178,7 +180,7 @@ module Gitlab
    def parse_json(response_body)
      Gitlab::Json.parse(response_body, legacy_mode: true)
    rescue JSON::ParserError
-      raise PrometheusClient::Error, 'Parsing response failed'
+      raise PrometheusClient::UnexpectedResponseError, 'Parsing response failed'
    end
  end
 end
--- a/lib/gitlab/usage_data.rb
+++ b/lib/gitlab/usage_data.rb
@@ -18,7 +18,6 @@ module Gitlab
    class << self
      include Gitlab::Utils::UsageData
      include Gitlab::Utils::StrongMemoize
-      include Gitlab::UsageDataConcerns::Topology

      def data(force_refresh: false)
        Rails.cache.fetch('usage_data', force: force_refresh, expires_in: 2.weeks) do
@@ -210,6 +209,7 @@ module Gitlab
          ldap_enabled: alt_usage_data(fallback: nil) { Gitlab.config.ldap.enabled },
          mattermost_enabled: alt_usage_data(fallback: nil) { Gitlab.config.mattermost.enabled },
          omniauth_enabled: alt_usage_data(fallback: nil) { Gitlab::Auth.omniauth_enabled? },
+          prometheus_enabled: alt_usage_data(fallback: nil) { Gitlab::Prometheus::Internal.prometheus_enabled? },
          prometheus_metrics_enabled: alt_usage_data(fallback: nil) { Gitlab::Metrics.prometheus_metrics_enabled? },
          reply_by_email_enabled: alt_usage_data(fallback: nil) { Gitlab::IncomingEmail.enabled? },
          signup_enabled: alt_usage_data(fallback: nil) { Gitlab::CurrentSettings.allow_signup? },
@@ -303,6 +303,10 @@ module Gitlab
        }
      end

+      def topology_usage_data
+        Gitlab::UsageData::Topology.new.topology_usage_data
+      end
+
      def ingress_modsecurity_usage
        ##
        # This method measures usage of the Modsecurity Web Application Firewall across the entire

--- a/lib/gitlab/usage_data_concerns/topology.rb
+++ b/lib/gitlab/usage_data_concerns/topology.rb
 # frozen_string_literal: true

 module Gitlab
-  module UsageDataConcerns
-    module Topology
+  class UsageData
+    class Topology
      include Gitlab::Utils::UsageData

      JOB_TO_SERVICE_NAME = {
@@ -16,11 +16,20 @@ module Gitlab
        'node' => 'node-exporter'
      }.freeze

-      def topology_usage_data
-        topology_data, duration = measure_duration do
-          alt_usage_data(fallback: {}) { topology_fetch_all_data }
+      CollectionFailure = Struct.new(:query, :error) do
+        def to_h
+          { query => error }
        end
-        { topology: topology_data.merge(duration_s: duration) }
+      end
+
+      def topology_usage_data
+        @failures = []
+        topology_data, duration = measure_duration { topology_fetch_all_data }
+        {
+          topology: topology_data
+                      .merge(duration_s: duration)
+                      .merge(failures: @failures.map(&:to_h))
+        }
      end

      private
@@ -32,10 +41,17 @@ module Gitlab
            nodes: topology_node_data(client)
          }.compact
        end
+      rescue => e
+        @failures << CollectionFailure.new('other', e.class.to_s)
+
+        {}
      end

      def topology_app_requests_per_hour(client)
-        result = client.query(one_week_average('gitlab_usage_ping:ops:rate5m')).first
+        result = query_safely('gitlab_usage_ping:ops:rate5m', 'app_requests', fallback: nil) do |query|
+          client.query(one_week_average(query)).first
+        end
+
        return unless result

        # the metric is recorded as a per-second rate
@@ -62,11 +78,15 @@ module Gitlab
      end

      def topology_node_memory(client)
-        aggregate_by_instance(client, 'gitlab_usage_ping:node_memory_total_bytes:avg')
+        query_safely('gitlab_usage_ping:node_memory_total_bytes:avg', 'node_memory', fallback: {}) do |query|
+          aggregate_by_instance(client, query)
+        end
      end

      def topology_node_cpus(client)
-        aggregate_by_instance(client, 'gitlab_usage_ping:node_cpus:count')
+        query_safely('gitlab_usage_ping:node_cpus:count', 'node_cpus', fallback: {}) do |query|
+          aggregate_by_instance(client, query)
+        end
      end

      def topology_all_service_memory(client)
@@ -78,19 +98,39 @@ module Gitlab
      end

      def topology_service_memory_rss(client)
-        aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process_resident_memory_bytes:avg')
+        query_safely(
+          'gitlab_usage_ping:node_service_process_resident_memory_bytes:avg', 'service_rss', fallback: []
+        ) { |query| aggregate_by_labels(client, query) }
      end

      def topology_service_memory_uss(client)
-        aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process_unique_memory_bytes:avg')
+        query_safely(
+          'gitlab_usage_ping:node_service_process_unique_memory_bytes:avg', 'service_uss', fallback: []
+        ) { |query| aggregate_by_labels(client, query) }
      end

      def topology_service_memory_pss(client)
-        aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process_proportional_memory_bytes:avg')
+        query_safely(
+          'gitlab_usage_ping:node_service_process_proportional_memory_bytes:avg', 'service_pss', fallback: []
+        ) { |query| aggregate_by_labels(client, query) }
      end

      def topology_all_service_process_count(client)
-        aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process:count')
+        query_safely(
+          'gitlab_usage_ping:node_service_process:count', 'service_process_count', fallback: []
+        ) { |query| aggregate_by_labels(client, query) }
+      end
+
+      def query_safely(query, query_name, fallback:)
+        result = yield query
+
+        return result if result.present?
+
+        @failures << CollectionFailure.new(query_name, 'empty_result')
+        fallback
+      rescue => e
+        @failures << CollectionFailure.new(query_name, e.class.to_s)
+        fallback
      end

      def topology_node_services(instance, all_process_counts, all_process_memory)

--- a/spec/lib/gitlab/prometheus_client_spec.rb
+++ b/spec/lib/gitlab/prometheus_client_spec.rb
@@ -32,7 +32,7 @@ RSpec.describe Gitlab::PrometheusClient do
    it 'raises error when status code not 200' do
      stub_request(:get, subject.health_url).to_return(status: 500, body: '')

-      expect { subject.healthy? }.to raise_error(Gitlab::PrometheusClient::Error)
+      expect { subject.healthy? }.to raise_error(Gitlab::PrometheusClient::UnexpectedResponseError)
    end
  end

@@ -41,41 +41,41 @@ RSpec.describe Gitlab::PrometheusClient do
  # - execute_query: A query call
  shared_examples 'failure response' do
    context 'when request returns 400 with an error message' do
-      it 'raises a Gitlab::PrometheusClient::Error error' do
+      it 'raises a Gitlab::PrometheusClient::QueryError error' do
        req_stub = stub_prometheus_request(query_url, status: 400, body: { error: 'bar!' })

        expect { execute_query }
-          .to raise_error(Gitlab::PrometheusClient::Error, 'bar!')
+          .to raise_error(Gitlab::PrometheusClient::QueryError, 'bar!')
        expect(req_stub).to have_been_requested
      end
    end

    context 'when request returns 400 without an error message' do
-      it 'raises a Gitlab::PrometheusClient::Error error' do
+      it 'raises a Gitlab::PrometheusClient::QueryError error' do
        req_stub = stub_prometheus_request(query_url, status: 400)

        expect { execute_query }
-          .to raise_error(Gitlab::PrometheusClient::Error, 'Bad data received')
+          .to raise_error(Gitlab::PrometheusClient::QueryError, 'Bad data received')
        expect(req_stub).to have_been_requested
      end
    end

    context 'when request returns 500' do
-      it 'raises a Gitlab::PrometheusClient::Error error' do
+      it 'raises a Gitlab::PrometheusClient::UnexpectedResponseError error' do
        req_stub = stub_prometheus_request(query_url, status: 500, body: { message: 'FAIL!' })

        expect { execute_query }
-          .to raise_error(Gitlab::PrometheusClient::Error, '500 - {"message":"FAIL!"}')
+          .to raise_error(Gitlab::PrometheusClient::UnexpectedResponseError, '500 - {"message":"FAIL!"}')
        expect(req_stub).to have_been_requested
      end
    end

    context 'when request returns non json data' do
-      it 'raises a Gitlab::PrometheusClient::Error error' do
+      it 'raises a Gitlab::PrometheusClient::UnexpectedResponseError error' do
        req_stub = stub_prometheus_request(query_url, status: 200, body: 'not json')

        expect { execute_query }
-          .to raise_error(Gitlab::PrometheusClient::Error, 'Parsing response failed')
+          .to raise_error(Gitlab::PrometheusClient::UnexpectedResponseError, 'Parsing response failed')
        expect(req_stub).to have_been_requested
      end
    end
@@ -85,35 +85,35 @@ RSpec.describe Gitlab::PrometheusClient do
    let(:prometheus_url) {"https://prometheus.invalid.example.com/api/v1/query?query=1"}

    shared_examples 'exceptions are raised' do
-      it 'raises a Gitlab::PrometheusClient::Error error when a SocketError is rescued' do
+      it 'raises a Gitlab::PrometheusClient::ConnectionError error when a SocketError is rescued' do
        req_stub = stub_prometheus_request_with_exception(prometheus_url, SocketError)

        expect { subject }
-          .to raise_error(Gitlab::PrometheusClient::Error, "Can't connect to #{prometheus_url}")
+          .to raise_error(Gitlab::PrometheusClient::ConnectionError, "Can't connect to #{prometheus_url}")
        expect(req_stub).to have_been_requested
      end

-      it 'raises a Gitlab::PrometheusClient::Error error when a SSLError is rescued' do
+      it 'raises a Gitlab::PrometheusClient::ConnectionError error when a SSLError is rescued' do
        req_stub = stub_prometheus_request_with_exception(prometheus_url, OpenSSL::SSL::SSLError)

        expect { subject }
-          .to raise_error(Gitlab::PrometheusClient::Error, "#{prometheus_url} contains invalid SSL data")
+          .to raise_error(Gitlab::PrometheusClient::ConnectionError, "#{prometheus_url} contains invalid SSL data")
        expect(req_stub).to have_been_requested
      end

-      it 'raises a Gitlab::PrometheusClient::Error error when a Gitlab::HTTP::ResponseError is rescued' do
+      it 'raises a Gitlab::PrometheusClient::ConnectionError error when a Gitlab::HTTP::ResponseError is rescued' do
        req_stub = stub_prometheus_request_with_exception(prometheus_url, Gitlab::HTTP::ResponseError)

        expect { subject }
-          .to raise_error(Gitlab::PrometheusClient::Error, "Network connection error")
+          .to raise_error(Gitlab::PrometheusClient::ConnectionError, "Network connection error")
        expect(req_stub).to have_been_requested
      end

-      it 'raises a Gitlab::PrometheusClient::Error error when a Gitlab::HTTP::ResponseError with a code is rescued' do
+      it 'raises a Gitlab::PrometheusClient::ConnectionError error when a Gitlab::HTTP::ResponseError with a code is rescued' do
        req_stub = stub_prometheus_request_with_exception(prometheus_url, Gitlab::HTTP::ResponseError.new(code: 400))

        expect { subject }
-          .to raise_error(Gitlab::PrometheusClient::Error, "Network connection error")
+          .to raise_error(Gitlab::PrometheusClient::ConnectionError, "Network connection error")
        expect(req_stub).to have_been_requested
      end
    end
@@ -400,9 +400,9 @@ RSpec.describe Gitlab::PrometheusClient do
        context "without response code" do
          let(:response_error) { Gitlab::HTTP::ResponseError }

-          it 'raises PrometheusClient::Error' do
+          it 'raises PrometheusClient::ConnectionError' do
            expect { subject.proxy('query', { query: prometheus_query }) }.to(
-              raise_error(Gitlab::PrometheusClient::Error, 'Network connection error')
+              raise_error(Gitlab::PrometheusClient::ConnectionError, 'Network connection error')
            )
          end
        end

--- a/spec/lib/gitlab/usage_data_concerns/topology_spec.rb
+++ b/spec/lib/gitlab/usage_data_concerns/topology_spec.rb
@@ -2,11 +2,11 @@

 require 'spec_helper'

-RSpec.describe Gitlab::UsageDataConcerns::Topology do
+RSpec.describe Gitlab::UsageData::Topology do
  include UsageDataHelpers

  describe '#topology_usage_data' do
-    subject { Class.new.extend(described_class).topology_usage_data }
+    subject { described_class.new.topology_usage_data }

    before do
      # this pins down time shifts when benchmarking durations
@@ -34,6 +34,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
          expect(subject[:topology]).to eq({
            duration_s: 0,
            application_requests_per_hour: 36,
+            failures: [],
            nodes: [
              {
                node_memory_total_bytes: 512,
@@ -76,7 +77,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
      end

      context 'and some node memory metrics are missing' do
-        it 'removes the respective entries' do
+        it 'removes the respective entries and includes the failures' do
          expect_prometheus_api_to(
            receive_app_request_volume_query(result: []),
            receive_node_memory_query(result: []),
@@ -89,6 +90,12 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do

          expect(subject[:topology]).to eq({
            duration_s: 0,
+            failures: [
+              { 'app_requests' => 'empty_result' },
+              { 'node_memory' => 'empty_result' },
+              { 'service_rss' => 'empty_result' },
+              { 'service_uss' => 'empty_result' }
+            ],
            nodes: [
              {
                node_cpus: 16,
@@ -123,31 +130,50 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
        end
      end

-      context 'and no results are found' do
-        it 'does not report anything' do
-          expect_prometheus_api_to receive(:query).at_least(:once).and_return({})
+      context 'and an error is raised when querying Prometheus' do
+        it 'returns empty result with failures' do
+          expect_prometheus_api_to receive(:query)
+            .at_least(:once)
+            .and_raise(Gitlab::PrometheusClient::ConnectionError)

          expect(subject[:topology]).to eq({
            duration_s: 0,
+            failures: [
+              { 'app_requests' => 'Gitlab::PrometheusClient::ConnectionError' },
+              { 'node_memory' => 'Gitlab::PrometheusClient::ConnectionError' },
+              { 'node_cpus' => 'Gitlab::PrometheusClient::ConnectionError' },
+              { 'service_rss' => 'Gitlab::PrometheusClient::ConnectionError' },
+              { 'service_uss' => 'Gitlab::PrometheusClient::ConnectionError' },
+              { 'service_pss' => 'Gitlab::PrometheusClient::ConnectionError' },
+              { 'service_process_count' => 'Gitlab::PrometheusClient::ConnectionError' }
+            ],
            nodes: []
          })
        end
      end
+    end

-      context 'and a connection error is raised' do
-        it 'does not report anything' do
-          expect_prometheus_api_to receive(:query).and_raise('Connection failed')
+    context 'when embedded Prometheus server is disabled' do
+      it 'returns empty result with no failures' do
+        expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)

-          expect(subject[:topology]).to eq({ duration_s: 0 })
-        end
+        expect(subject[:topology]).to eq({
+          duration_s: 0,
+          failures: []
+        })
      end
    end

-    context 'when embedded Prometheus server is disabled' do
-      it 'does not report anything' do
-        expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
+    context 'when top-level function raises error' do
+      it 'returns empty result with generic failure' do
+        allow(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_raise(RuntimeError)

-        expect(subject[:topology]).to eq({ duration_s: 0 })
+        expect(subject[:topology]).to eq({
+          duration_s: 0,
+          failures: [
+            { 'other' => 'RuntimeError' }
+          ]
+        })
      end
    end
  end

--- a/spec/lib/gitlab/usage_data_spec.rb
+++ b/spec/lib/gitlab/usage_data_spec.rb
@@ -347,6 +347,20 @@ RSpec.describe Gitlab::UsageData, :aggregate_failures do
        expect(subject[:grafana_link_enabled]).to eq(Gitlab::CurrentSettings.grafana_enabled?)
      end

+      context 'with embedded Prometheus' do
+        it 'returns true when embedded Prometheus is enabled' do
+          allow(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(true)
+
+          expect(subject[:prometheus_enabled]).to eq(true)
+        end
+
+        it 'returns false when embedded Prometheus is disabled' do
+          allow(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
+
+          expect(subject[:prometheus_enabled]).to eq(false)
+        end
+      end
+
      context 'with embedded grafana' do
        it 'returns true when embedded grafana is enabled' do
          stub_application_setting(grafana_enabled: true)

--- a/spec/models/project_services/prometheus_service_spec.rb
+++ b/spec/models/project_services/prometheus_service_spec.rb
@@ -23,7 +23,7 @@ RSpec.describe PrometheusService, :use_clean_rails_memory_store_caching do

      # result = { success: false, result: error }
      expect(result[:success]).to be_falsy
-      expect(result[:result]).to be_instance_of(Gitlab::PrometheusClient::Error)
+      expect(result[:result]).to be_instance_of(Gitlab::PrometheusClient::UnexpectedResponseError)

      expect(redirect_req_stub).to have_been_requested
      expect(redirected_req_stub).not_to have_been_requested