Commit 8a6af784 authored by Sean McGivern's avatar Sean McGivern

Merge branch 'feature/gb/add-build-trace-chunk-migration-redundancy-metric' into 'master'

Reduce noise created by duplicated trace migration workers

See merge request gitlab-org/gitlab!45128
parents fc2a5ab2 3fcc83a5
...@@ -136,6 +136,10 @@ module Ci ...@@ -136,6 +136,10 @@ module Ci
# We are using optimistic locking combined with Redis locking to ensure # We are using optimistic locking combined with Redis locking to ensure
# that a chunk gets migrated properly. # that a chunk gets migrated properly.
# #
# We are catching an exception related to an exclusive lock not being
# acquired because it is creating a lot of noise, and is a result of
# duplicated workers running in parallel for the same build trace chunk.
#
def persist_data! def persist_data!
in_lock(*lock_params) do # exclusive Redis lock is acquired first in_lock(*lock_params) do # exclusive Redis lock is acquired first
raise FailedToPersistDataError, 'Modifed build trace chunk detected' if has_changes_to_save? raise FailedToPersistDataError, 'Modifed build trace chunk detected' if has_changes_to_save?
...@@ -144,6 +148,8 @@ module Ci ...@@ -144,6 +148,8 @@ module Ci
chunk.unsafe_persist_data! # we migrate the data and update data store chunk.unsafe_persist_data! # we migrate the data and update data store
end end
end end
rescue FailedToObtainLockError
metrics.increment_trace_operation(operation: :stalled)
rescue ActiveRecord::StaleObjectError rescue ActiveRecord::StaleObjectError
raise FailedToPersistDataError, <<~MSG raise FailedToPersistDataError, <<~MSG
Data migration race condition detected Data migration race condition detected
......
...@@ -6,9 +6,20 @@ module Gitlab ...@@ -6,9 +6,20 @@ module Gitlab
class Metrics class Metrics
extend Gitlab::Utils::StrongMemoize extend Gitlab::Utils::StrongMemoize
OPERATIONS = [:appended, :streamed, :chunked, :mutated, :overwrite, OPERATIONS = [
:accepted, :finalized, :discarded, :conflict, :locked, :appended, # new trace data has been written to a chunk
:invalid].freeze :streamed, # new trace data has been sent by a runner
:chunked, # new trace chunk has been created
:mutated, # trace has been mutated when removing secrets
:overwrite, # runner requested overwritting a build trace
:accepted, # scheduled chunks for migration and responded with 202
:finalized, # all live build trace chunks have been persisted
:discarded, # failed to persist live chunks before timeout
:conflict, # runner has sent unrecognized build state details
:locked, # build trace has been locked by a different mechanism
:stalled, # failed to migrate chunk due to a worker duplication
:invalid # malformed build trace has been detected using CRC32
].freeze
def increment_trace_operation(operation: :unknown) def increment_trace_operation(operation: :unknown)
unless OPERATIONS.include?(operation) unless OPERATIONS.include?(operation)
......
...@@ -508,20 +508,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do ...@@ -508,20 +508,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
subject { build_trace_chunk.persist_data! } subject { build_trace_chunk.persist_data! }
shared_examples_for 'Atomic operation' do
context 'when the other process is persisting' do
let(:lease_key) { "trace_write:#{build_trace_chunk.build.id}:chunks:#{build_trace_chunk.chunk_index}" }
before do
stub_exclusive_lease_taken(lease_key)
end
it 'raise an error' do
expect { subject }.to raise_error('Failed to obtain a lock')
end
end
end
context 'when data_store is redis' do context 'when data_store is redis' do
let(:data_store) { :redis } let(:data_store) { :redis }
...@@ -552,8 +538,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do ...@@ -552,8 +538,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
expect(build_trace_chunk.reload.checksum).to eq '3398914352' expect(build_trace_chunk.reload.checksum).to eq '3398914352'
end end
it_behaves_like 'Atomic operation'
end end
context 'when data size has not reached CHUNK_SIZE' do context 'when data size has not reached CHUNK_SIZE' do
...@@ -606,6 +590,35 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do ...@@ -606,6 +590,35 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
/Modifed build trace chunk detected/) /Modifed build trace chunk detected/)
end end
end end
context 'when the chunk is being locked by a different worker' do
let(:metrics) { spy('metrics') }
it 'does not raise an exception' do
lock_chunk do
expect { build_trace_chunk.persist_data! }.not_to raise_error
end
end
it 'increments stalled chunk trace metric' do
allow(build_trace_chunk)
.to receive(:metrics)
.and_return(metrics)
lock_chunk { build_trace_chunk.persist_data! }
expect(metrics)
.to have_received(:increment_trace_operation)
.with(operation: :stalled)
.once
end
def lock_chunk(&block)
"trace_write:#{build.id}:chunks:#{chunk_index}".then do |key|
build_trace_chunk.in_lock(key, &block)
end
end
end
end end
end end
...@@ -640,8 +653,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do ...@@ -640,8 +653,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
expect(Ci::BuildTraceChunks::Database.new.data(build_trace_chunk)).to be_nil expect(Ci::BuildTraceChunks::Database.new.data(build_trace_chunk)).to be_nil
expect(Ci::BuildTraceChunks::Fog.new.data(build_trace_chunk)).to eq(data) expect(Ci::BuildTraceChunks::Fog.new.data(build_trace_chunk)).to eq(data)
end end
it_behaves_like 'Atomic operation'
end end
context 'when data size has not reached CHUNK_SIZE' do context 'when data size has not reached CHUNK_SIZE' do
...@@ -701,8 +712,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do ...@@ -701,8 +712,6 @@ RSpec.describe Ci::BuildTraceChunk, :clean_gitlab_redis_shared_state do
expect(Ci::BuildTraceChunks::Database.new.data(build_trace_chunk)).to be_nil expect(Ci::BuildTraceChunks::Database.new.data(build_trace_chunk)).to be_nil
expect(Ci::BuildTraceChunks::Fog.new.data(build_trace_chunk)).to eq(data) expect(Ci::BuildTraceChunks::Fog.new.data(build_trace_chunk)).to eq(data)
end end
it_behaves_like 'Atomic operation'
end end
context 'when data size has not reached CHUNK_SIZE' do context 'when data size has not reached CHUNK_SIZE' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment