Commit 938b891f authored by Kamil Trzciński

Merge branch 'add-counter-for-trace-chunks' into 'master'

Improve traceability for failed attempts of archiving traces

Closes #51502

See merge request gitlab-org/gitlab-ce!21826
parents f6e51e83 9fa86977
# frozen_string_literal: true
module Ci
  # Archives the trace of a single CI job while making sure that a failing
  # archive attempt never propagates to the caller.
  class ArchiveTraceService
    # Asks the job's trace to archive itself.
    #
    # A trace that has already been archived is silently accepted. Any other
    # StandardError is counted, logged and reported, and then swallowed.
    #
    # @param job [#trace, #id] the job whose trace should be archived
    # @return [Object, nil] the result of `archive!` on success, nil when the
    #   trace was already archived, otherwise the result of the error report
    def execute(job)
      job.trace.archive!
    rescue ::Gitlab::Ci::Trace::AlreadyArchivedError
      # Already archived: there is nothing left to do, so this is not an error.
      nil
    rescue StandardError => error
      # The failure is recorded in the application log, Sentry and Prometheus
      # instead of being raised, so that the surrounding system keeps running.
      # If `archive!` keeps failing for over a week, that could incur data loss.
      # (See more https://docs.gitlab.com/ee/administration/job_traces.html#new-live-trace-architecture)
      report_failure(error, job)
    end

    private

    # Records one failed archive attempt: bumps the failure counter, writes an
    # error line to the Rails log and forwards the exception to Sentry.
    def report_failure(error, job)
      failed_archive_counter.increment

      Rails.logger.error("Failed to archive trace. id: #{job.id} message: #{error.message}")

      Gitlab::Sentry.track_exception(
        error,
        issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502',
        extra: { job_id: job.id }
      )
    end

    # Counter of failed archive attempts, looked up once per service instance.
    def failed_archive_counter
      @failed_archive_counter ||= Gitlab::Metrics.counter(
        :job_trace_archive_failed_total,
        "Counter of failed attempts of trace archiving"
      )
    end
  end
end
......@@ -7,7 +7,7 @@ class ArchiveTraceWorker
# rubocop: disable CodeReuse/ActiveRecord
# Archives the trace of the build with the given id.
#
# The build is looked up among builds without an archived trace; when no
# such build is found, nothing is executed. Archiving is delegated solely to
# Ci::ArchiveTraceService, which rescues archiving errors and reports them.
# The trace must not also be archived directly here: that would archive it
# twice and let an exception from the unguarded call escape the worker.
#
# @param job_id [Integer] id of the build whose trace should be archived
def perform(job_id)
  Ci::Build.without_archived_trace.find_by(id: job_id).try do |job|
    Ci::ArchiveTraceService.new.execute(job)
  end
end
# rubocop: enable CodeReuse/ActiveRecord
......
......@@ -11,21 +11,9 @@ module Ci
# This could happen when ArchiveTraceWorker sidekiq jobs were lost by receiving SIGKILL
# More details in https://gitlab.com/gitlab-org/gitlab-ce/issues/36791
Ci::Build.finished.with_live_trace.find_each(batch_size: 100) do |build|
begin
build.trace.archive!
rescue ::Gitlab::Ci::Trace::AlreadyArchivedError
rescue => e
failed_archive_counter.increment
Rails.logger.error "Failed to archive stale live trace. id: #{build.id} message: #{e.message}"
end
Ci::ArchiveTraceService.new.execute(build)
end
end
# rubocop: enable CodeReuse/ActiveRecord
private
# Returns the counter used for failed archive attempts, requesting it from
# Gitlab::Metrics on first use and reusing the same object afterwards.
def failed_archive_counter
  @failed_archive_counter ||= begin
    metric_name = :job_trace_archive_failed_total
    description = "Counter of failed attempts of traces archiving"

    Gitlab::Metrics.counter(metric_name, description)
  end
end
end
end
require 'spec_helper'
# Specs for Ci::ArchiveTraceService#execute: a finished job gets an archived
# trace, an already archived trace is ignored, and a failing archive attempt
# is reported without raising.
describe Ci::ArchiveTraceService, '#execute' do
  subject { described_class.new.execute(job) }

  context 'when job is finished' do
    let(:job) { create(:ci_build, :success, :trace_live) }

    it 'creates an archived trace' do
      expect { subject }.not_to raise_error

      expect(job.reload.job_artifacts_trace).to be_exist
    end

    context 'when trace is already archived' do
      let!(:job) { create(:ci_build, :success, :trace_artifact) }

      it 'ignores an exception' do
        expect { subject }.not_to raise_error
      end

      it 'does not create an archived trace' do
        expect { subject }.not_to change { Ci::JobArtifact.trace.count }
      end
    end
  end

  context 'when job is running' do
    let(:job) { create(:ci_build, :running, :trace_live) }

    # The expected log line shows the failure message for a running job:
    # "Job is not finished yet".
    it 'increments Prometheus counter, sends crash report to Sentry and ignore an error for continuing to archive' do
      expect(Gitlab::Metrics)
        .to receive(:counter)
        .with(:job_trace_archive_failed_total, "Counter of failed attempts of trace archiving")
        .and_call_original

      expect(Rails.logger)
        .to receive(:error)
        .with("Failed to archive trace. id: #{job.id} message: Job is not finished yet")
        .and_call_original

      expect(Gitlab::Sentry)
        .to receive(:track_exception)
        .with(
          ::Gitlab::Ci::Trace::ArchiveError,
          issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502',
          extra: { job_id: job.id }
        )
        .once

      expect { subject }.not_to raise_error
    end
  end
end
......@@ -5,10 +5,11 @@ describe ArchiveTraceWorker do
subject { described_class.new.perform(job&.id) }
context 'when job is found' do
let(:job) { create(:ci_build) }
let(:job) { create(:ci_build, :trace_live) }
# The worker hands archiving over to Ci::ArchiveTraceService, so the
# delegation is the only thing asserted here. With #execute mocked,
# Gitlab::Ci::Trace#archive! is not expected to be reached by the worker.
it 'executes service' do
  expect_any_instance_of(Ci::ArchiveTraceService)
    .to receive(:execute).with(job)

  subject
end
......@@ -18,7 +19,8 @@ describe ArchiveTraceWorker do
let(:job) { nil }
# With no job to look up, neither the service nor the trace may be touched.
it 'does not execute service' do
  expect_any_instance_of(Ci::ArchiveTraceService).not_to receive(:execute)
  expect_any_instance_of(Gitlab::Ci::Trace).not_to receive(:archive!)

  subject
end
......
......@@ -30,6 +30,13 @@ describe Ci::ArchiveTracesCronWorker do
it_behaves_like 'archives trace'
# The cron worker is expected to pass the build to the archive service.
it 'executes service' do
  expect_any_instance_of(Ci::ArchiveTraceService).to receive(:execute).with(build)

  subject
end
context 'when a trace had already been archived' do
let!(:build) { create(:ci_build, :success, :trace_live, :trace_artifact) }
let!(:build2) { create(:ci_build, :success, :trace_live) }
......@@ -46,11 +53,12 @@ describe Ci::ArchiveTracesCronWorker do
let!(:build) { create(:ci_build, :success, :trace_live) }
# Force every archive attempt to fail and stub out the Sentry report.
before do
  allow_any_instance_of(Gitlab::Ci::Trace)
    .to receive(:archive!)
    .and_raise('Unexpected error')

  allow(Gitlab::Sentry).to receive(:track_exception)
end
# The failure is logged by Ci::ArchiveTraceService, whose message reads
# "Failed to archive trace. ..."; the former "stale live trace" wording is
# no longer expected.
it 'puts a log' do
  expect(Rails.logger)
    .to receive(:error)
    .with("Failed to archive trace. id: #{build.id} message: Unexpected error")

  subject
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment