Commit 4e499a76 authored by Matthias Käppler, committed by James Lopez

Track failures in usage ping payload

This exports a `failures` field carrying information
about failed queries during topology data collection.
parent 0d34cf28
......@@ -594,6 +594,7 @@ appear to be associated to any of the services running, since they all appear to
| `ldap_enabled` | | | | | |
| `mattermost_enabled` | | | | | |
| `omniauth_enabled` | | | | | |
| `prometheus_enabled` | | | | | Whether the bundled Prometheus is enabled |
| `prometheus_metrics_enabled` | | | | | |
| `reply_by_email_enabled` | | | | | |
| `average` | `avg_cycle_analytics - code` | | | | |
......@@ -671,6 +672,7 @@ appear to be associated to any of the services running, since they all appear to
| `merge_requests_users` | `usage_activity_by_stage_monthly` | `create` | | | Unique count of users who used a merge request |
| `duration_s` | `topology` | `enablement` | | | Time it took to collect topology data |
| `application_requests_per_hour` | `topology` | `enablement` | | | Number of requests to the web application per hour |
| `failures` | `topology` | `enablement` | | | List of queries that failed during topology data collection, each entry mapping a query name to the failure reason |
| `nodes` | `topology` | `enablement` | | | The list of server nodes on which GitLab components are running |
| `node_memory_total_bytes` | `topology > nodes` | `enablement` | | | The total available memory of this node |
| `node_cpus` | `topology > nodes` | `enablement` | | | The number of CPU cores of this node |
......@@ -723,6 +725,7 @@ The following is example content of the Usage Ping payload.
"ldap_enabled": false,
"mattermost_enabled": false,
"omniauth_enabled": true,
"prometheus_enabled": false,
"prometheus_metrics_enabled": false,
"reply_by_email_enabled": "incoming+%{key}@incoming.gitlab.com",
"signup_enabled": true,
......@@ -879,6 +882,7 @@ The following is example content of the Usage Ping payload.
"topology": {
"duration_s": 0.013836685999194742,
"application_requests_per_hour": 4224,
"failures": [],
"nodes": [
{
"node_memory_total_bytes": 33269903360,
......
......@@ -5,6 +5,8 @@ module Gitlab
class PrometheusClient
include Gitlab::Utils::StrongMemoize
Error = Class.new(StandardError)
ConnectionError = Class.new(Gitlab::PrometheusClient::Error)
UnexpectedResponseError = Class.new(Gitlab::PrometheusClient::Error)
QueryError = Class.new(Gitlab::PrometheusClient::Error)
HEALTHY_RESPONSE = "Prometheus is Healthy.\n"
......@@ -44,7 +46,7 @@ module Gitlab
path = api_path(type)
get(path, args)
rescue Gitlab::HTTP::ResponseError => ex
raise PrometheusClient::Error, "Network connection error" unless ex.response && ex.response.try(:code)
raise PrometheusClient::ConnectionError, "Network connection error" unless ex.response && ex.response.try(:code)
handle_querying_api_response(ex.response)
end
......@@ -115,7 +117,7 @@ module Gitlab
response = get(path, args)
handle_querying_api_response(response)
rescue Gitlab::HTTP::ResponseError => ex
raise PrometheusClient::Error, "Network connection error" unless ex.response && ex.response.try(:code)
raise PrometheusClient::ConnectionError, "Network connection error" unless ex.response && ex.response.try(:code)
handle_querying_api_response(ex.response)
end
......@@ -137,18 +139,18 @@ module Gitlab
def get(path, args)
Gitlab::HTTP.get(path, { query: args }.merge(http_options) )
rescue SocketError
raise PrometheusClient::Error, "Can't connect to #{api_url}"
raise PrometheusClient::ConnectionError, "Can't connect to #{api_url}"
rescue OpenSSL::SSL::SSLError
raise PrometheusClient::Error, "#{api_url} contains invalid SSL data"
raise PrometheusClient::ConnectionError, "#{api_url} contains invalid SSL data"
rescue Errno::ECONNREFUSED
raise PrometheusClient::Error, 'Connection refused'
raise PrometheusClient::ConnectionError, 'Connection refused'
end
def handle_management_api_response(response)
if response.code == 200
response.body
else
raise PrometheusClient::Error, "#{response.code} - #{response.body}"
raise PrometheusClient::UnexpectedResponseError, "#{response.code} - #{response.body}"
end
end
......@@ -156,7 +158,7 @@ module Gitlab
response_code = response.try(:code)
response_body = response.try(:body)
raise PrometheusClient::Error, "#{response_code} - #{response_body}" unless response_code
raise PrometheusClient::UnexpectedResponseError, "#{response_code} - #{response_body}" unless response_code
json_data = parse_json(response_body) if [200, 400].include?(response_code)
......@@ -166,7 +168,7 @@ module Gitlab
when 400
raise PrometheusClient::QueryError, json_data['error'] || 'Bad data received'
else
raise PrometheusClient::Error, "#{response_code} - #{response_body}"
raise PrometheusClient::UnexpectedResponseError, "#{response_code} - #{response_body}"
end
end
......@@ -178,7 +180,7 @@ module Gitlab
def parse_json(response_body)
Gitlab::Json.parse(response_body, legacy_mode: true)
rescue JSON::ParserError
raise PrometheusClient::Error, 'Parsing response failed'
raise PrometheusClient::UnexpectedResponseError, 'Parsing response failed'
end
end
end
......@@ -18,7 +18,6 @@ module Gitlab
class << self
include Gitlab::Utils::UsageData
include Gitlab::Utils::StrongMemoize
include Gitlab::UsageDataConcerns::Topology
def data(force_refresh: false)
Rails.cache.fetch('usage_data', force: force_refresh, expires_in: 2.weeks) do
......@@ -210,6 +209,7 @@ module Gitlab
ldap_enabled: alt_usage_data(fallback: nil) { Gitlab.config.ldap.enabled },
mattermost_enabled: alt_usage_data(fallback: nil) { Gitlab.config.mattermost.enabled },
omniauth_enabled: alt_usage_data(fallback: nil) { Gitlab::Auth.omniauth_enabled? },
prometheus_enabled: alt_usage_data(fallback: nil) { Gitlab::Prometheus::Internal.prometheus_enabled? },
prometheus_metrics_enabled: alt_usage_data(fallback: nil) { Gitlab::Metrics.prometheus_metrics_enabled? },
reply_by_email_enabled: alt_usage_data(fallback: nil) { Gitlab::IncomingEmail.enabled? },
signup_enabled: alt_usage_data(fallback: nil) { Gitlab::CurrentSettings.allow_signup? },
......@@ -303,6 +303,10 @@ module Gitlab
}
end
# Delegates topology collection to a fresh Gitlab::UsageData::Topology
# instance and returns whatever that collector reports.
def topology_usage_data
  collector = Gitlab::UsageData::Topology.new
  collector.topology_usage_data
end
def ingress_modsecurity_usage
##
# This method measures usage of the Modsecurity Web Application Firewall across the entire
......
# frozen_string_literal: true
module Gitlab
module UsageDataConcerns
module Topology
class UsageData
class Topology
include Gitlab::Utils::UsageData
JOB_TO_SERVICE_NAME = {
......@@ -16,11 +16,20 @@ module Gitlab
'node' => 'node-exporter'
}.freeze
def topology_usage_data
topology_data, duration = measure_duration do
alt_usage_data(fallback: {}) { topology_fetch_all_data }
# Describes one failed topology query: `query` is the short name of the query
# and `error` is the reason it failed.
CollectionFailure = Struct.new(:query, :error) do
  # Serializes the failure as a single-pair hash of the form { query => error },
  # replacing Struct's default member-keyed hash.
  def to_h
    Hash[query, error]
  end
end
{ topology: topology_data.merge(duration_s: duration) }
# Collects topology data and reports it together with collection metadata.
#
# Resets the per-run failure list, times the call to topology_fetch_all_data
# via measure_duration, and returns a hash whose :topology entry is the
# collected data merged with :duration_s and :failures (each recorded failure
# converted with to_h).
def topology_usage_data
  @failures = []

  collected, elapsed = measure_duration { topology_fetch_all_data }
  summary = { duration_s: elapsed, failures: @failures.map(&:to_h) }

  { topology: collected.merge(summary) }
end
private
......@@ -32,10 +41,17 @@ module Gitlab
nodes: topology_node_data(client)
}.compact
end
rescue => e
@failures << CollectionFailure.new('other', e.class.to_s)
{}
end
def topology_app_requests_per_hour(client)
result = client.query(one_week_average('gitlab_usage_ping:ops:rate5m')).first
result = query_safely('gitlab_usage_ping:ops:rate5m', 'app_requests', fallback: nil) do |query|
client.query(one_week_average(query)).first
end
return unless result
# the metric is recorded as a per-second rate
......@@ -62,11 +78,15 @@ module Gitlab
end
def topology_node_memory(client)
aggregate_by_instance(client, 'gitlab_usage_ping:node_memory_total_bytes:avg')
query_safely('gitlab_usage_ping:node_memory_total_bytes:avg', 'node_memory', fallback: {}) do |query|
aggregate_by_instance(client, query)
end
end
def topology_node_cpus(client)
aggregate_by_instance(client, 'gitlab_usage_ping:node_cpus:count')
query_safely('gitlab_usage_ping:node_cpus:count', 'node_cpus', fallback: {}) do |query|
aggregate_by_instance(client, query)
end
end
def topology_all_service_memory(client)
......@@ -78,19 +98,39 @@ module Gitlab
end
def topology_service_memory_rss(client)
aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process_resident_memory_bytes:avg')
query_safely(
'gitlab_usage_ping:node_service_process_resident_memory_bytes:avg', 'service_rss', fallback: []
) { |query| aggregate_by_labels(client, query) }
end
def topology_service_memory_uss(client)
aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process_unique_memory_bytes:avg')
query_safely(
'gitlab_usage_ping:node_service_process_unique_memory_bytes:avg', 'service_uss', fallback: []
) { |query| aggregate_by_labels(client, query) }
end
def topology_service_memory_pss(client)
aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process_proportional_memory_bytes:avg')
query_safely(
'gitlab_usage_ping:node_service_process_proportional_memory_bytes:avg', 'service_pss', fallback: []
) { |query| aggregate_by_labels(client, query) }
end
def topology_all_service_process_count(client)
aggregate_by_labels(client, 'gitlab_usage_ping:node_service_process:count')
query_safely(
'gitlab_usage_ping:node_service_process:count', 'service_process_count', fallback: []
) { |query| aggregate_by_labels(client, query) }
end
# Runs one topology query through the given block, recording a failure
# instead of propagating when the block yields nothing or raises.
#
# query      - value handed to the block unchanged
# query_name - label stored as the `query` member of the recorded failure
# fallback:  - value returned when the block's result is not present or the
#              block raises a StandardError
#
# Returns the block's result when it is present, otherwise `fallback`.
def query_safely(query, query_name, fallback:)
  begin
    outcome = yield(query)
    failure_reason = 'empty_result' unless outcome.present?
  rescue => e
    # The error's class name is what gets reported as the failure reason.
    failure_reason = e.class.to_s
  end

  return outcome unless failure_reason

  @failures << CollectionFailure.new(query_name, failure_reason)
  fallback
end
def topology_node_services(instance, all_process_counts, all_process_memory)
......
......@@ -32,7 +32,7 @@ RSpec.describe Gitlab::PrometheusClient do
it 'raises error when status code not 200' do
stub_request(:get, subject.health_url).to_return(status: 500, body: '')
expect { subject.healthy? }.to raise_error(Gitlab::PrometheusClient::Error)
expect { subject.healthy? }.to raise_error(Gitlab::PrometheusClient::UnexpectedResponseError)
end
end
......@@ -41,41 +41,41 @@ RSpec.describe Gitlab::PrometheusClient do
# - execute_query: A query call
shared_examples 'failure response' do
context 'when request returns 400 with an error message' do
it 'raises a Gitlab::PrometheusClient::Error error' do
it 'raises a Gitlab::PrometheusClient::QueryError error' do
req_stub = stub_prometheus_request(query_url, status: 400, body: { error: 'bar!' })
expect { execute_query }
.to raise_error(Gitlab::PrometheusClient::Error, 'bar!')
.to raise_error(Gitlab::PrometheusClient::QueryError, 'bar!')
expect(req_stub).to have_been_requested
end
end
context 'when request returns 400 without an error message' do
it 'raises a Gitlab::PrometheusClient::Error error' do
it 'raises a Gitlab::PrometheusClient::QueryError error' do
req_stub = stub_prometheus_request(query_url, status: 400)
expect { execute_query }
.to raise_error(Gitlab::PrometheusClient::Error, 'Bad data received')
.to raise_error(Gitlab::PrometheusClient::QueryError, 'Bad data received')
expect(req_stub).to have_been_requested
end
end
context 'when request returns 500' do
it 'raises a Gitlab::PrometheusClient::Error error' do
it 'raises a Gitlab::PrometheusClient::UnexpectedResponseError error' do
req_stub = stub_prometheus_request(query_url, status: 500, body: { message: 'FAIL!' })
expect { execute_query }
.to raise_error(Gitlab::PrometheusClient::Error, '500 - {"message":"FAIL!"}')
.to raise_error(Gitlab::PrometheusClient::UnexpectedResponseError, '500 - {"message":"FAIL!"}')
expect(req_stub).to have_been_requested
end
end
context 'when request returns non json data' do
it 'raises a Gitlab::PrometheusClient::Error error' do
it 'raises a Gitlab::PrometheusClient::UnexpectedResponseError error' do
req_stub = stub_prometheus_request(query_url, status: 200, body: 'not json')
expect { execute_query }
.to raise_error(Gitlab::PrometheusClient::Error, 'Parsing response failed')
.to raise_error(Gitlab::PrometheusClient::UnexpectedResponseError, 'Parsing response failed')
expect(req_stub).to have_been_requested
end
end
......@@ -85,35 +85,35 @@ RSpec.describe Gitlab::PrometheusClient do
let(:prometheus_url) {"https://prometheus.invalid.example.com/api/v1/query?query=1"}
shared_examples 'exceptions are raised' do
it 'raises a Gitlab::PrometheusClient::Error error when a SocketError is rescued' do
it 'raises a Gitlab::PrometheusClient::ConnectionError error when a SocketError is rescued' do
req_stub = stub_prometheus_request_with_exception(prometheus_url, SocketError)
expect { subject }
.to raise_error(Gitlab::PrometheusClient::Error, "Can't connect to #{prometheus_url}")
.to raise_error(Gitlab::PrometheusClient::ConnectionError, "Can't connect to #{prometheus_url}")
expect(req_stub).to have_been_requested
end
it 'raises a Gitlab::PrometheusClient::Error error when a SSLError is rescued' do
it 'raises a Gitlab::PrometheusClient::ConnectionError error when a SSLError is rescued' do
req_stub = stub_prometheus_request_with_exception(prometheus_url, OpenSSL::SSL::SSLError)
expect { subject }
.to raise_error(Gitlab::PrometheusClient::Error, "#{prometheus_url} contains invalid SSL data")
.to raise_error(Gitlab::PrometheusClient::ConnectionError, "#{prometheus_url} contains invalid SSL data")
expect(req_stub).to have_been_requested
end
it 'raises a Gitlab::PrometheusClient::Error error when a Gitlab::HTTP::ResponseError is rescued' do
it 'raises a Gitlab::PrometheusClient::ConnectionError error when a Gitlab::HTTP::ResponseError is rescued' do
req_stub = stub_prometheus_request_with_exception(prometheus_url, Gitlab::HTTP::ResponseError)
expect { subject }
.to raise_error(Gitlab::PrometheusClient::Error, "Network connection error")
.to raise_error(Gitlab::PrometheusClient::ConnectionError, "Network connection error")
expect(req_stub).to have_been_requested
end
it 'raises a Gitlab::PrometheusClient::Error error when a Gitlab::HTTP::ResponseError with a code is rescued' do
it 'raises a Gitlab::PrometheusClient::ConnectionError error when a Gitlab::HTTP::ResponseError with a code is rescued' do
req_stub = stub_prometheus_request_with_exception(prometheus_url, Gitlab::HTTP::ResponseError.new(code: 400))
expect { subject }
.to raise_error(Gitlab::PrometheusClient::Error, "Network connection error")
.to raise_error(Gitlab::PrometheusClient::ConnectionError, "Network connection error")
expect(req_stub).to have_been_requested
end
end
......@@ -400,9 +400,9 @@ RSpec.describe Gitlab::PrometheusClient do
context "without response code" do
let(:response_error) { Gitlab::HTTP::ResponseError }
it 'raises PrometheusClient::Error' do
it 'raises PrometheusClient::ConnectionError' do
expect { subject.proxy('query', { query: prometheus_query }) }.to(
raise_error(Gitlab::PrometheusClient::Error, 'Network connection error')
raise_error(Gitlab::PrometheusClient::ConnectionError, 'Network connection error')
)
end
end
......
......@@ -2,11 +2,11 @@
require 'spec_helper'
RSpec.describe Gitlab::UsageDataConcerns::Topology do
RSpec.describe Gitlab::UsageData::Topology do
include UsageDataHelpers
describe '#topology_usage_data' do
subject { Class.new.extend(described_class).topology_usage_data }
subject { described_class.new.topology_usage_data }
before do
# this pins down time shifts when benchmarking durations
......@@ -34,6 +34,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
expect(subject[:topology]).to eq({
duration_s: 0,
application_requests_per_hour: 36,
failures: [],
nodes: [
{
node_memory_total_bytes: 512,
......@@ -76,7 +77,7 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
end
context 'and some node memory metrics are missing' do
it 'removes the respective entries' do
it 'removes the respective entries and includes the failures' do
expect_prometheus_api_to(
receive_app_request_volume_query(result: []),
receive_node_memory_query(result: []),
......@@ -89,6 +90,12 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'empty_result' },
{ 'node_memory' => 'empty_result' },
{ 'service_rss' => 'empty_result' },
{ 'service_uss' => 'empty_result' }
],
nodes: [
{
node_cpus: 16,
......@@ -123,31 +130,50 @@ RSpec.describe Gitlab::UsageDataConcerns::Topology do
end
end
context 'and no results are found' do
it 'does not report anything' do
expect_prometheus_api_to receive(:query).at_least(:once).and_return({})
context 'and an error is raised when querying Prometheus' do
it 'returns empty result with failures' do
expect_prometheus_api_to receive(:query)
.at_least(:once)
.and_raise(Gitlab::PrometheusClient::ConnectionError)
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'app_requests' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_cpus' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_rss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_uss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_pss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_process_count' => 'Gitlab::PrometheusClient::ConnectionError' }
],
nodes: []
})
end
end
end
context 'and a connection error is raised' do
it 'does not report anything' do
expect_prometheus_api_to receive(:query).and_raise('Connection failed')
context 'when embedded Prometheus server is disabled' do
it 'returns empty result with no failures' do
expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
expect(subject[:topology]).to eq({ duration_s: 0 })
end
expect(subject[:topology]).to eq({
duration_s: 0,
failures: []
})
end
end
context 'when embedded Prometheus server is disabled' do
it 'does not report anything' do
expect(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
context 'when top-level function raises error' do
it 'returns empty result with generic failure' do
allow(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_raise(RuntimeError)
expect(subject[:topology]).to eq({ duration_s: 0 })
expect(subject[:topology]).to eq({
duration_s: 0,
failures: [
{ 'other' => 'RuntimeError' }
]
})
end
end
end
......
......@@ -347,6 +347,20 @@ RSpec.describe Gitlab::UsageData, :aggregate_failures do
expect(subject[:grafana_link_enabled]).to eq(Gitlab::CurrentSettings.grafana_enabled?)
end
# Verifies that the :prometheus_enabled entry of the payload under test
# mirrors the value returned by Gitlab::Prometheus::Internal.prometheus_enabled?.
context 'with embedded Prometheus' do
it 'returns true when embedded Prometheus is enabled' do
# Stub the check so the example does not depend on the local configuration.
allow(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(true)
expect(subject[:prometheus_enabled]).to eq(true)
end
it 'returns false when embedded Prometheus is disabled' do
# Same stub with the opposite answer; the payload value must follow it.
allow(Gitlab::Prometheus::Internal).to receive(:prometheus_enabled?).and_return(false)
expect(subject[:prometheus_enabled]).to eq(false)
end
end
context 'with embedded grafana' do
it 'returns true when embedded grafana is enabled' do
stub_application_setting(grafana_enabled: true)
......
......@@ -23,7 +23,7 @@ RSpec.describe PrometheusService, :use_clean_rails_memory_store_caching do
# result = { success: false, result: error }
expect(result[:success]).to be_falsy
expect(result[:result]).to be_instance_of(Gitlab::PrometheusClient::Error)
expect(result[:result]).to be_instance_of(Gitlab::PrometheusClient::UnexpectedResponseError)
expect(redirect_req_stub).to have_been_requested
expect(redirected_req_stub).not_to have_been_requested
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment