Commit 197b1c3e authored by Fabio Pitino's avatar Fabio Pitino

Merge branch '259619-add-metric-for-tracking-invalid-traces' into 'master'

Add metrics to track invalid job traces

See merge request gitlab-org/gitlab!72001
parents dee08e12 3bda0e58
...@@ -73,9 +73,11 @@ module Ci ...@@ -73,9 +73,11 @@ module Ci
::Gitlab::Ci::Trace::Checksum.new(build).then do |checksum| ::Gitlab::Ci::Trace::Checksum.new(build).then do |checksum|
unless checksum.valid? unless checksum.valid?
metrics.increment_trace_operation(operation: :invalid) metrics.increment_trace_operation(operation: :invalid)
metrics.increment_error_counter(type: :chunks_invalid_checksum)
if checksum.corrupted? if checksum.corrupted?
metrics.increment_trace_operation(operation: :corrupted) metrics.increment_trace_operation(operation: :corrupted)
metrics.increment_error_counter(type: :chunks_invalid_size)
end end
next unless log_invalid_chunks? next unless log_invalid_chunks?
......
...@@ -140,6 +140,7 @@ The following metrics are available: ...@@ -140,6 +140,7 @@ The following metrics are available:
| `gitlab_snowplow_events_total` | Counter | 14.1 | Total number of GitLab Snowplow product intelligence events emitted | | | `gitlab_snowplow_events_total` | Counter | 14.1 | Total number of GitLab Snowplow product intelligence events emitted | |
| `gitlab_snowplow_failed_events_total` | Counter | 14.1 | Total number of GitLab Snowplow product intelligence events emission failures | | | `gitlab_snowplow_failed_events_total` | Counter | 14.1 | Total number of GitLab Snowplow product intelligence events emission failures | |
| `gitlab_snowplow_successful_events_total` | Counter | 14.1 | Total number of GitLab Snowplow product intelligence events emission successes | | | `gitlab_snowplow_successful_events_total` | Counter | 14.1 | Total number of GitLab Snowplow product intelligence events emission successes | |
| `gitlab_ci_build_trace_errors_total` | Counter | 14.4 | Total amount of different error types on a build trace | `type` |
## Metrics controlled by a feature flag ## Metrics controlled by a feature flag
......
...@@ -21,6 +21,12 @@ module Gitlab ...@@ -21,6 +21,12 @@ module Gitlab
:corrupted # malformed trace found after comparing CRC32 and size :corrupted # malformed trace found after comparing CRC32 and size
].freeze ].freeze
TRACE_ERROR_TYPES = [
:chunks_invalid_size, # used to be :corrupted
:chunks_invalid_checksum, # used to be :invalid
:archive_invalid_checksum # malformed trace found into object store after comparing MD5
].freeze
def increment_trace_operation(operation: :unknown) def increment_trace_operation(operation: :unknown)
unless OPERATIONS.include?(operation) unless OPERATIONS.include?(operation)
raise ArgumentError, "unknown trace operation: #{operation}" raise ArgumentError, "unknown trace operation: #{operation}"
...@@ -33,6 +39,14 @@ module Gitlab ...@@ -33,6 +39,14 @@ module Gitlab
self.class.trace_bytes.increment({}, size.to_i) self.class.trace_bytes.increment({}, size.to_i)
end end
def increment_error_counter(type: :unknown)
unless TRACE_ERROR_TYPES.include?(type)
raise ArgumentError, "unknown error type: #{type}"
end
self.class.trace_errors_counter.increment(type: type)
end
def observe_migration_duration(seconds) def observe_migration_duration(seconds)
self.class.finalize_histogram.observe({}, seconds.to_f) self.class.finalize_histogram.observe({}, seconds.to_f)
end end
...@@ -65,6 +79,15 @@ module Gitlab ...@@ -65,6 +79,15 @@ module Gitlab
::Gitlab::Metrics.histogram(name, comment, labels, buckets) ::Gitlab::Metrics.histogram(name, comment, labels, buckets)
end end
end end
def self.trace_errors_counter
strong_memoize(:trace_errors_counter) do
name = :gitlab_ci_build_trace_errors_total
comment = 'Total amount of different error types on a build trace'
Gitlab::Metrics.counter(name, comment)
end
end
end end
end end
end end
......
...@@ -15,4 +15,27 @@ RSpec.describe Gitlab::Ci::Trace::Metrics, :prometheus do ...@@ -15,4 +15,27 @@ RSpec.describe Gitlab::Ci::Trace::Metrics, :prometheus do
end end
end end
end end
describe '#increment_error_counter' do
context 'when the operation type is known' do
it 'increments the counter' do
subject.increment_error_counter(type: :chunks_invalid_size)
subject.increment_error_counter(type: :chunks_invalid_checksum)
subject.increment_error_counter(type: :archive_invalid_checksum)
expect(described_class.trace_errors_counter.get(type: :chunks_invalid_size)).to eq 1
expect(described_class.trace_errors_counter.get(type: :chunks_invalid_checksum)).to eq 1
expect(described_class.trace_errors_counter.get(type: :archive_invalid_checksum)).to eq 1
expect(described_class.trace_errors_counter.values.count).to eq 3
end
end
context 'when the operation type is known' do
it 'raises an exception' do
expect { subject.increment_error_counter(type: :invalid_type) }
.to raise_error(ArgumentError)
end
end
end
end end
...@@ -112,6 +112,14 @@ RSpec.describe Ci::UpdateBuildStateService do ...@@ -112,6 +112,14 @@ RSpec.describe Ci::UpdateBuildStateService do
.not_to have_received(:increment_trace_operation) .not_to have_received(:increment_trace_operation)
.with(operation: :invalid) .with(operation: :invalid)
end end
it 'does not increment chunks_invalid_checksum trace metric' do
execute_with_stubbed_metrics!
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
end
end end
context 'when build trace has been migrated' do context 'when build trace has been migrated' do
...@@ -174,6 +182,14 @@ RSpec.describe Ci::UpdateBuildStateService do ...@@ -174,6 +182,14 @@ RSpec.describe Ci::UpdateBuildStateService do
.to have_received(:increment_trace_operation) .to have_received(:increment_trace_operation)
.with(operation: :invalid) .with(operation: :invalid)
end end
it 'increments chunks_invalid_checksum trace metric' do
execute_with_stubbed_metrics!
expect(metrics)
.to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
end
end end
context 'when trace checksum is valid' do context 'when trace checksum is valid' do
...@@ -191,6 +207,14 @@ RSpec.describe Ci::UpdateBuildStateService do ...@@ -191,6 +207,14 @@ RSpec.describe Ci::UpdateBuildStateService do
expect(metrics) expect(metrics)
.not_to have_received(:increment_trace_operation) .not_to have_received(:increment_trace_operation)
.with(operation: :corrupted) .with(operation: :corrupted)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_size)
end end
context 'when using deprecated parameters' do context 'when using deprecated parameters' do
...@@ -208,6 +232,14 @@ RSpec.describe Ci::UpdateBuildStateService do ...@@ -208,6 +232,14 @@ RSpec.describe Ci::UpdateBuildStateService do
expect(metrics) expect(metrics)
.not_to have_received(:increment_trace_operation) .not_to have_received(:increment_trace_operation)
.with(operation: :corrupted) .with(operation: :corrupted)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_size)
end end
end end
end end
...@@ -227,6 +259,14 @@ RSpec.describe Ci::UpdateBuildStateService do ...@@ -227,6 +259,14 @@ RSpec.describe Ci::UpdateBuildStateService do
expect(metrics) expect(metrics)
.to have_received(:increment_trace_operation) .to have_received(:increment_trace_operation)
.with(operation: :corrupted) .with(operation: :corrupted)
expect(metrics)
.to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
expect(metrics)
.to have_received(:increment_error_counter)
.with(type: :chunks_invalid_size)
end end
end end
...@@ -242,9 +282,17 @@ RSpec.describe Ci::UpdateBuildStateService do ...@@ -242,9 +282,17 @@ RSpec.describe Ci::UpdateBuildStateService do
.to have_received(:increment_trace_operation) .to have_received(:increment_trace_operation)
.with(operation: :invalid) .with(operation: :invalid)
expect(metrics)
.to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
expect(metrics) expect(metrics)
.not_to have_received(:increment_trace_operation) .not_to have_received(:increment_trace_operation)
.with(operation: :corrupted) .with(operation: :corrupted)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_size)
end end
end end
...@@ -325,6 +373,10 @@ RSpec.describe Ci::UpdateBuildStateService do ...@@ -325,6 +373,10 @@ RSpec.describe Ci::UpdateBuildStateService do
expect(metrics) expect(metrics)
.not_to have_received(:increment_trace_operation) .not_to have_received(:increment_trace_operation)
.with(operation: :invalid) .with(operation: :invalid)
expect(metrics)
.not_to have_received(:increment_error_counter)
.with(type: :chunks_invalid_checksum)
end end
context 'when build pending state is outdated' do context 'when build pending state is outdated' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment