Commit 8a6af784 authored by Sean McGivern's avatar Sean McGivern

Merge branch 'feature/gb/add-build-trace-chunk-migration-redundancy-metric' into 'master'

Reduce noise created by duplicated trace migration workers

See merge request gitlab-org/gitlab!45128
parents fc2a5ab2 3fcc83a5
......@@ -136,6 +136,10 @@ module Ci
# We are using optimistic locking combined with Redis locking to ensure
# that a chunk gets migrated properly.
#
# We are catching an exception related to an exclusive lock not being
# acquired because it is creating a lot of noise, and is a result of
# duplicated workers running in parallel for the same build trace chunk.
#
def persist_data!
in_lock(*lock_params) do # exclusive Redis lock is acquired first
raise FailedToPersistDataError, 'Modifed build trace chunk detected' if has_changes_to_save?
......@@ -144,6 +148,8 @@ module Ci
chunk.unsafe_persist_data! # we migrate the data and update data store
end
end
rescue FailedToObtainLockError
metrics.increment_trace_operation(operation: :stalled)
rescue ActiveRecord::StaleObjectError
raise FailedToPersistDataError, <<~MSG
Data migration race condition detected
......
......@@ -6,9 +6,20 @@ module Gitlab
class Metrics
extend Gitlab::Utils::StrongMemoize
OPERATIONS = [:appended, :streamed, :chunked, :mutated, :overwrite,
:accepted, :finalized, :discarded, :conflict, :locked,
:invalid].freeze
OPERATIONS = [
:appended, # new trace data has been written to a chunk
:streamed, # new trace data has been sent by a runner
:chunked, # new trace chunk has been created
:mutated, # trace has been mutated when removing secrets
:overwrite, # runner requested overwritting a build trace
:accepted, # scheduled chunks for migration and responded with 202
:finalized, # all live build trace chunks have been persisted
:discarded, # failed to persist live chunks before timeout
:conflict, # runner has sent unrecognized build state details
:locked, # build trace has been locked by a different mechanism
:stalled, # failed to migrate chunk due to a worker duplication
:invalid # malformed build trace has been detected using CRC32
].freeze
def increment_trace_operation(operation: :unknown)
unless OPERATIONS.include?(operation)
......
......@@ -508,20 +508,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
subject { build_trace_chunk.persist_data! }
shared_examples_for 'Atomic operation' do
context 'when the other process is persisting' do
let(:lease_key) { "trace_write:#{build_trace_chunk.build.id}:chunks:#{build_trace_chunk.chunk_index}" }
before do
stub_exclusive_lease_taken(lease_key)
end
it 'raise an error' do
expect { subject }.to raise_error('Failed to obtain a lock')
end
end
end
context 'when data_store is redis' do
let(:data_store) { :redis }
......@@ -552,8 +538,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
expect(build_trace_chunk.reload.checksum).to eq '3398914352'
end
it_behaves_like 'Atomic operation'
end
context 'when data size has not reached CHUNK_SIZE' do
......@@ -606,6 +590,35 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
/Modifed build trace chunk detected/)
end
end
context 'when the chunk is being locked by a different worker' do
let(:metrics) { spy('metrics') }
it 'does not raise an exception' do
lock_chunk do
expect { build_trace_chunk.persist_data! }.not_to raise_error
end
end
it 'increments stalled chunk trace metric' do
allow(build_trace_chunk)
.to receive(:metrics)
.and_return(metrics)
lock_chunk { build_trace_chunk.persist_data! }
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :stalled)
.once
end
def lock_chunk(&block)
"trace_write:#{build.id}:chunks:#{chunk_index}".then do |key|
build_trace_chunk.in_lock(key, &block)
end
end
end
end
end
......@@ -640,8 +653,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
expect(Ci::BuildTraceChunks::Database.new.data(build_trace_chunk)).to be_nil
expect(Ci::BuildTraceChunks::Fog.new.data(build_trace_chunk)).to eq(data)
end
it_behaves_like 'Atomic operation'
end
context 'when data size has not reached CHUNK_SIZE' do
......@@ -701,8 +712,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
expect(Ci::BuildTraceChunks::Database.new.data(build_trace_chunk)).to be_nil
expect(Ci::BuildTraceChunks::Fog.new.data(build_trace_chunk)).to eq(data)
end
it_behaves_like 'Atomic operation'
end
context 'when data size has not reached CHUNK_SIZE' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment