Commit 197b1c3e authored by Fabio Pitino

Merge branch '259619-add-metric-for-tracking-invalid-traces' into 'master'

Add metrics to track invalid job traces

See merge request gitlab-org/gitlab!72001
parents dee08e12 3bda0e58
......@@ -73,9 +73,11 @@ module Ci
::Gitlab::Ci::Trace::Checksum.new(build).then do |checksum|
unless checksum.valid?
metrics.increment_trace_operation(operation: :invalid)
metrics.increment_error_counter(type: :chunks_invalid_checksum)
if checksum.corrupted?
metrics.increment_trace_operation(operation: :corrupted)
metrics.increment_error_counter(type: :chunks_invalid_size)
end
next unless log_invalid_chunks?
......
......@@ -122,7 +122,7 @@ The following metrics are available:
| `action_cable_single_client_transmissions_total` | Counter | 13.10 | The number of ActionCable messages transmitted to any client in any channel | `server_mode` |
| `action_cable_subscription_confirmations_total` | Counter | 13.10 | The number of ActionCable subscriptions from clients confirmed | `server_mode` |
| `action_cable_subscription_rejections_total` | Counter | 13.10 | The number of ActionCable subscriptions from clients rejected | `server_mode` |
| `action_cable_transmitted_bytes` | Histogram | 14.1 | Message size, in bytes, transmitted over action cable | `operation`, `channel` |
| `action_cable_transmitted_bytes` | Histogram | 14.1 | Message size, in bytes, transmitted over action cable | `operation`, `channel` |
| `gitlab_issuable_fast_count_by_state_total` | Counter | 13.5 | Total number of row count operations on issue/merge request list pages | |
| `gitlab_issuable_fast_count_by_state_failures_total` | Counter | 13.5 | Number of soft-failed row count operations on issue/merge request list pages | |
| `gitlab_external_http_total` | Counter | 13.8 | Total number of HTTP calls to external systems | `controller`, `action` |
......@@ -132,14 +132,15 @@ The following metrics are available:
| `pipeline_graph_link_calculation_duration_seconds` | Histogram | 13.9 | Total time spent calculating links, in seconds | |
| `pipeline_graph_links_total` | Histogram | 13.9 | Number of links per graph | |
| `pipeline_graph_links_per_job_ratio` | Histogram | 13.9 | Ratio of links to job per graph | |
| `gitlab_ci_pipeline_security_orchestration_policy_processing_duration_seconds` | Histogram | 13.12 | Time in seconds it takes to process Security Policies in CI/CD pipeline | |
| `gitlab_spamcheck_request_duration_seconds` | Histogram | 13.12 | The duration for requests between Rails and the anti-spam engine | |
| `gitlab_ci_pipeline_security_orchestration_policy_processing_duration_seconds` | Histogram | 13.12 | Time in seconds it takes to process Security Policies in CI/CD pipeline | |
| `gitlab_spamcheck_request_duration_seconds` | Histogram | 13.12 | The duration for requests between Rails and the anti-spam engine | |
| `service_desk_thank_you_email` | Counter | 14.0 | Total number of email responses to new service desk emails | |
| `service_desk_new_note_email` | Counter | 14.0 | Total number of email notifications on new service desk comment | |
| `email_receiver_error` | Counter | 14.1 | Total number of errors when processing incoming emails | |
| `gitlab_snowplow_events_total` | Counter | 14.1 | Total number of GitLab Snowplow product intelligence events emitted | |
| `gitlab_snowplow_failed_events_total` | Counter | 14.1 | Total number of GitLab Snowplow product intelligence events emission failures | |
| `gitlab_snowplow_successful_events_total` | Counter | 14.1 | Total number of GitLab Snowplow product intelligence events emission successes | |
| `gitlab_ci_build_trace_errors_total` | Counter | 14.4 | Total amount of different error types on a build trace | `type` |
## Metrics controlled by a feature flag
......
......@@ -21,6 +21,12 @@ module Gitlab
:corrupted # malformed trace found after comparing CRC32 and size
].freeze
# Error types that may be passed to #increment_error_counter:
#   :chunks_invalid_size      - used to be :corrupted
#   :chunks_invalid_checksum  - used to be :invalid
#   :archive_invalid_checksum - malformed trace found in object store after comparing MD5
TRACE_ERROR_TYPES = %i[
  chunks_invalid_size
  chunks_invalid_checksum
  archive_invalid_checksum
].freeze
def increment_trace_operation(operation: :unknown)
unless OPERATIONS.include?(operation)
raise ArgumentError, "unknown trace operation: #{operation}"
......@@ -33,6 +39,14 @@ module Gitlab
self.class.trace_bytes.increment({}, size.to_i)
end
# Bumps the trace errors counter, labelled with the given error type.
#
# type - Symbol, must be one of TRACE_ERROR_TYPES (defaults to :unknown,
#        which is not in the list and therefore raises).
#
# Raises ArgumentError when the type is not listed in TRACE_ERROR_TYPES.
def increment_error_counter(type: :unknown)
  raise ArgumentError, "unknown error type: #{type}" unless TRACE_ERROR_TYPES.include?(type)

  self.class.trace_errors_counter.increment(type: type)
end
# Records a migration duration on the class-level finalize histogram.
#
# seconds - duration value; it is converted with #to_f before being observed.
def observe_migration_duration(seconds)
  histogram = self.class.finalize_histogram
  histogram.observe({}, seconds.to_f)
end
......@@ -65,6 +79,15 @@ module Gitlab
::Gitlab::Metrics.histogram(name, comment, labels, buckets)
end
end
# Counter metric registered as :gitlab_ci_build_trace_errors_total.
# The counter is built inside strong_memoize(:trace_errors_counter), so the
# Gitlab::Metrics.counter call is wrapped by that helper.
def self.trace_errors_counter
  strong_memoize(:trace_errors_counter) do
    Gitlab::Metrics.counter(
      :gitlab_ci_build_trace_errors_total,
      'Total amount of different error types on a build trace'
    )
  end
end
end
end
end
......
......@@ -15,4 +15,27 @@ RSpec.describe Gitlab::Ci::Trace::Metrics, :prometheus do
end
end
end
# Covers Metrics#increment_error_counter: every listed error type is counted
# under its own `type` label, and a type outside the list is rejected.
describe '#increment_error_counter' do
  context 'when the error type is known' do
    it 'increments the counter' do
      subject.increment_error_counter(type: :chunks_invalid_size)
      subject.increment_error_counter(type: :chunks_invalid_checksum)
      subject.increment_error_counter(type: :archive_invalid_checksum)

      expect(described_class.trace_errors_counter.get(type: :chunks_invalid_size)).to eq 1
      expect(described_class.trace_errors_counter.get(type: :chunks_invalid_checksum)).to eq 1
      expect(described_class.trace_errors_counter.get(type: :archive_invalid_checksum)).to eq 1

      # One series per label value, nothing else recorded.
      expect(described_class.trace_errors_counter.values.count).to eq 3
    end
  end

  # This context previously reused the 'known' description although it
  # exercises the rejection path for a type that is not in the list.
  context 'when the error type is unknown' do
    it 'raises an exception' do
      expect { subject.increment_error_counter(type: :invalid_type) }
        .to raise_error(ArgumentError, 'unknown error type: invalid_type')
    end
  end
end
end
......@@ -112,6 +112,14 @@ RSpec.describe Ci::UpdateBuildStateService do
.not_to have_received(:increment_trace_operation)
.with(operation: :invalid)
end
# The stubbed metrics object must not have been asked to count a
# :chunks_invalid_checksum error after the service ran.
it 'does not increment chunks_invalid_checksum trace metric' do
  execute_with_stubbed_metrics!

  expect(metrics).not_to have_received(:increment_error_counter).with(type: :chunks_invalid_checksum)
end
end
context 'when build trace has been migrated' do
......@@ -174,6 +182,14 @@ RSpec.describe Ci::UpdateBuildStateService do
.to have_received(:increment_trace_operation)
.with(operation: :invalid)
end
# The stubbed metrics object must have been asked to count a
# :chunks_invalid_checksum error after the service ran.
it 'increments chunks_invalid_checksum trace metric' do
  execute_with_stubbed_metrics!

  expect(metrics).to have_received(:increment_error_counter).with(type: :chunks_invalid_checksum)
end
end
context 'when trace checksum is valid' do
......@@ -191,6 +207,14 @@ RSpec.describe Ci::UpdateBuildStateService do
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :corrupted)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_size)
end
context 'when using deprecated parameters' do
......@@ -208,6 +232,14 @@ RSpec.describe Ci::UpdateBuildStateService do
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :corrupted)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_size)
end
end
end
......@@ -227,6 +259,14 @@ RSpec.describe Ci::UpdateBuildStateService do
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :corrupted)
expect(metrics)
.to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
expect(metrics)
.to have_received(:increment_error_counter)
.with(type: :chunks_invalid_size)
end
end
......@@ -242,9 +282,17 @@ RSpec.describe Ci::UpdateBuildStateService do
.to have_received(:increment_trace_operation)
.with(operation: :invalid)
expect(metrics)
.to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :corrupted)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_size)
end
end
......@@ -325,6 +373,10 @@ RSpec.describe Ci::UpdateBuildStateService do
expect(metrics)
.not_to have_received(:increment_trace_operation)
.with(operation: :invalid)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
end
context 'when build pending state is outdated' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment