Commit c2e0e689 authored by Shinya Maeda's avatar Shinya Maeda

Validate the existence of archived traces before removing live trace

Often live traces are removed even though the archived trace
doesn't exist. This commit checkes the existence strictly.
parent 1a3fda63
...@@ -531,6 +531,14 @@ module Ci ...@@ -531,6 +531,14 @@ module Ci
trace.exist? trace.exist?
end end
def has_live_trace?
trace.live_trace_exist?
end
def has_archived_trace?
trace.archived_trace_exist?
end
def artifacts_file def artifacts_file
job_artifacts_archive&.file job_artifacts_archive&.file
end end
......
...@@ -176,6 +176,10 @@ module Ci ...@@ -176,6 +176,10 @@ module Ci
end end
end end
def self.archived_trace_exists_for?(job_id)
where(job_id: job_id).trace.take&.file&.file&.exists?
end
private private
def file_format_adapter_class def file_format_adapter_class
......
...@@ -2,8 +2,25 @@ ...@@ -2,8 +2,25 @@
module Ci module Ci
class ArchiveTraceService class ArchiveTraceService
def execute(job) def execute(job, worker_name:)
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_live_trace?
Sidekiq.logger.warn(class: worker_name,
message: 'The job does not have live trace but going to be archived.',
job_id: job.id)
return
end
job.trace.archive! job.trace.archive!
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless job.has_archived_trace?
Sidekiq.logger.warn(class: worker_name,
message: 'The job does not have archived trace after archiving.',
job_id: job.id)
end
rescue ::Gitlab::Ci::Trace::AlreadyArchivedError rescue ::Gitlab::Ci::Trace::AlreadyArchivedError
# It's already archived, thus we can safely ignore this exception. # It's already archived, thus we can safely ignore this exception.
rescue => e rescue => e
...@@ -11,7 +28,7 @@ module Ci ...@@ -11,7 +28,7 @@ module Ci
# If `archive!` keeps failing for over a week, that could incur data loss. # If `archive!` keeps failing for over a week, that could incur data loss.
# (See more https://docs.gitlab.com/ee/administration/job_traces.html#new-live-trace-architecture) # (See more https://docs.gitlab.com/ee/administration/job_traces.html#new-live-trace-architecture)
# In order to avoid interrupting the system, we do not raise an exception here. # In order to avoid interrupting the system, we do not raise an exception here.
archive_error(e, job) archive_error(e, job, worker_name)
end end
private private
...@@ -22,9 +39,12 @@ module Ci ...@@ -22,9 +39,12 @@ module Ci
"Counter of failed attempts of trace archiving") "Counter of failed attempts of trace archiving")
end end
def archive_error(error, job) def archive_error(error, job, worker_name)
failed_archive_counter.increment failed_archive_counter.increment
Rails.logger.error "Failed to archive trace. id: #{job.id} message: #{error.message}" # rubocop:disable Gitlab/RailsLogger
Sidekiq.logger.warn(class: worker_name,
message: "Failed to archive trace. message: #{error.message}.",
job_id: job.id)
Gitlab::Sentry Gitlab::Sentry
.track_exception(error, .track_exception(error,
......
...@@ -7,7 +7,7 @@ class ArchiveTraceWorker ...@@ -7,7 +7,7 @@ class ArchiveTraceWorker
# rubocop: disable CodeReuse/ActiveRecord # rubocop: disable CodeReuse/ActiveRecord
def perform(job_id) def perform(job_id)
Ci::Build.without_archived_trace.find_by(id: job_id).try do |job| Ci::Build.without_archived_trace.find_by(id: job_id).try do |job|
Ci::ArchiveTraceService.new.execute(job) Ci::ArchiveTraceService.new.execute(job, worker_name: self.class.name)
end end
end end
# rubocop: enable CodeReuse/ActiveRecord # rubocop: enable CodeReuse/ActiveRecord
......
...@@ -11,7 +11,7 @@ module Ci ...@@ -11,7 +11,7 @@ module Ci
# This could happen when ArchiveTraceWorker sidekiq jobs were lost by receiving SIGKILL # This could happen when ArchiveTraceWorker sidekiq jobs were lost by receiving SIGKILL
# More details in https://gitlab.com/gitlab-org/gitlab-ce/issues/36791 # More details in https://gitlab.com/gitlab-org/gitlab-ce/issues/36791
Ci::Build.finished.with_live_trace.find_each(batch_size: 100) do |build| Ci::Build.finished.with_live_trace.find_each(batch_size: 100) do |build|
Ci::ArchiveTraceService.new.execute(build) Ci::ArchiveTraceService.new.execute(build, worker_name: self.class.name)
end end
end end
# rubocop: enable CodeReuse/ActiveRecord # rubocop: enable CodeReuse/ActiveRecord
......
---
title: Extra logging for new live trace architecture
merge_request: 30892
author:
type: fixed
...@@ -63,7 +63,15 @@ module Gitlab ...@@ -63,7 +63,15 @@ module Gitlab
end end
def exist? def exist?
trace_artifact&.exists? || job.trace_chunks.any? || current_path.present? || old_trace.present? archived_trace_exist? || live_trace_exist?
end
def archived_trace_exist?
trace_artifact&.exists?
end
def live_trace_exist?
job.trace_chunks.any? || current_path.present? || old_trace.present?
end end
def read def read
...@@ -167,7 +175,7 @@ module Gitlab ...@@ -167,7 +175,7 @@ module Gitlab
def clone_file!(src_stream, temp_dir) def clone_file!(src_stream, temp_dir)
FileUtils.mkdir_p(temp_dir) FileUtils.mkdir_p(temp_dir)
Dir.mktmpdir('tmp-trace', temp_dir) do |dir_path| Dir.mktmpdir("tmp-trace-#{job.id}", temp_dir) do |dir_path|
temp_path = File.join(dir_path, "job.log") temp_path = File.join(dir_path, "job.log")
FileUtils.touch(temp_path) FileUtils.touch(temp_path)
size = IO.copy_stream(src_stream, temp_path) size = IO.copy_stream(src_stream, temp_path)
......
...@@ -166,6 +166,13 @@ module Gitlab ...@@ -166,6 +166,13 @@ module Gitlab
end end
def destroy! def destroy!
# TODO: Remove this logging once we confirmed new live trace architecture is functional.
# See https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/4667.
unless build.has_archived_trace?
Sidekiq.logger.warn(message: 'The job does not have archived trace but going to be destroyed.',
job_id: build.id)
end
trace_chunks.fast_destroy_all trace_chunks.fast_destroy_all
@tell = @size = 0 @tell = @size = 0
ensure ensure
......
...@@ -442,5 +442,15 @@ describe Gitlab::Ci::Trace::ChunkedIO, :clean_gitlab_redis_cache do ...@@ -442,5 +442,15 @@ describe Gitlab::Ci::Trace::ChunkedIO, :clean_gitlab_redis_cache do
expect(Ci::BuildTraceChunk.where(build: build).count).to eq(0) expect(Ci::BuildTraceChunk.where(build: build).count).to eq(0)
end end
context 'when the job does not have archived trace' do
it 'leaves a message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
message: 'The job does not have archived trace but going to be destroyed.',
job_id: build.id).and_call_original
subject
end
end
end end
end end
...@@ -692,6 +692,34 @@ describe Ci::Build do ...@@ -692,6 +692,34 @@ describe Ci::Build do
end end
end end
describe '#has_live_trace?' do
subject { build.has_live_trace? }
let(:build) { create(:ci_build, :trace_live) }
it { is_expected.to be_truthy }
context 'when build does not have live trace' do
let(:build) { create(:ci_build) }
it { is_expected.to be_falsy }
end
end
describe '#has_archived_trace?' do
subject { build.has_archived_trace? }
let(:build) { create(:ci_build, :trace_artifact) }
it { is_expected.to be_truthy }
context 'when build does not have archived trace' do
let(:build) { create(:ci_build) }
it { is_expected.to be_falsy }
end
end
describe '#has_job_artifacts?' do describe '#has_job_artifacts?' do
subject { build.has_job_artifacts? } subject { build.has_job_artifacts? }
......
...@@ -70,6 +70,31 @@ describe Ci::JobArtifact do ...@@ -70,6 +70,31 @@ describe Ci::JobArtifact do
end end
end end
describe '.archived_trace_exists_for?' do
subject { described_class.archived_trace_exists_for?(job_id) }
let!(:artifact) { create(:ci_job_artifact, :trace, job: job) }
let(:job) { create(:ci_build) }
context 'when the specified job_id exists' do
let(:job_id) { job.id }
it { is_expected.to be_truthy }
context 'when the job does have archived trace' do
let!(:artifact) { }
it { is_expected.to be_falsy }
end
end
context 'when the specified job_id does not exist' do
let(:job_id) { 10000 }
it { is_expected.to be_falsy }
end
end
describe 'callbacks' do describe 'callbacks' do
subject { create(:ci_job_artifact, :archive) } subject { create(:ci_job_artifact, :archive) }
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
require 'spec_helper' require 'spec_helper'
describe Ci::ArchiveTraceService, '#execute' do describe Ci::ArchiveTraceService, '#execute' do
subject { described_class.new.execute(job) } subject { described_class.new.execute(job, worker_name: ArchiveTraceWorker.name) }
context 'when job is finished' do context 'when job is finished' do
let(:job) { create(:ci_build, :success, :trace_live) } let(:job) { create(:ci_build, :success, :trace_live) }
...@@ -25,6 +25,34 @@ describe Ci::ArchiveTraceService, '#execute' do ...@@ -25,6 +25,34 @@ describe Ci::ArchiveTraceService, '#execute' do
expect { subject }.not_to change { Ci::JobArtifact.trace.count } expect { subject }.not_to change { Ci::JobArtifact.trace.count }
end end
end end
context 'when job does not have trace' do
let(:job) { create(:ci_build, :success) }
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: 'The job does not have live trace but going to be archived.',
job_id: job.id)
subject
end
end
context 'when job failed to archive trace but did not raise an exception' do
before do
allow_any_instance_of(Gitlab::Ci::Trace).to receive(:archive!) {}
end
it 'leaves a warning message in sidekiq log' do
expect(Sidekiq.logger).to receive(:warn).with(
class: ArchiveTraceWorker.name,
message: 'The job does not have archived trace after archiving.',
job_id: job.id)
subject
end
end
end end
context 'when job is running' do context 'when job is running' do
...@@ -37,10 +65,10 @@ describe Ci::ArchiveTraceService, '#execute' do ...@@ -37,10 +65,10 @@ describe Ci::ArchiveTraceService, '#execute' do
issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502', issue_url: 'https://gitlab.com/gitlab-org/gitlab-ce/issues/51502',
extra: { job_id: job.id } ).once extra: { job_id: job.id } ).once
expect(Rails.logger) expect(Sidekiq.logger).to receive(:warn).with(
.to receive(:error) class: ArchiveTraceWorker.name,
.with("Failed to archive trace. id: #{job.id} message: Job is not finished yet") message: "Failed to archive trace. message: Job is not finished yet.",
.and_call_original job_id: job.id).and_call_original
expect(Gitlab::Metrics) expect(Gitlab::Metrics)
.to receive(:counter) .to receive(:counter)
......
...@@ -720,6 +720,58 @@ shared_examples_for 'trace with enabled live trace feature' do ...@@ -720,6 +720,58 @@ shared_examples_for 'trace with enabled live trace feature' do
end end
end end
describe '#archived_trace_exist?' do
subject { trace.archived_trace_exist? }
context 'when trace does not exist' do
it { is_expected.to be_falsy }
end
context 'when archived trace exists' do
before do
create(:ci_job_artifact, :trace, job: build)
end
it { is_expected.to be_truthy }
end
context 'when live trace exists' do
before do
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
stream.write('abc')
end
end
it { is_expected.to be_falsy }
end
end
describe '#live_trace_exist?' do
subject { trace.live_trace_exist? }
context 'when trace does not exist' do
it { is_expected.to be_falsy }
end
context 'when archived trace exists' do
before do
create(:ci_job_artifact, :trace, job: build)
end
it { is_expected.to be_falsy }
end
context 'when live trace exists' do
before do
Gitlab::Ci::Trace::ChunkedIO.new(build) do |stream|
stream.write('abc')
end
end
it { is_expected.to be_truthy }
end
end
describe '#archive!' do describe '#archive!' do
subject { trace.archive! } subject { trace.archive! }
......
...@@ -11,7 +11,7 @@ describe ArchiveTraceWorker do ...@@ -11,7 +11,7 @@ describe ArchiveTraceWorker do
it 'executes service' do it 'executes service' do
expect_any_instance_of(Ci::ArchiveTraceService) expect_any_instance_of(Ci::ArchiveTraceService)
.to receive(:execute).with(job) .to receive(:execute).with(job, anything)
subject subject
end end
......
...@@ -34,7 +34,7 @@ describe Ci::ArchiveTracesCronWorker do ...@@ -34,7 +34,7 @@ describe Ci::ArchiveTracesCronWorker do
it 'executes service' do it 'executes service' do
expect_any_instance_of(Ci::ArchiveTraceService) expect_any_instance_of(Ci::ArchiveTraceService)
.to receive(:execute).with(build) .to receive(:execute).with(build, anything)
subject subject
end end
...@@ -60,7 +60,10 @@ describe Ci::ArchiveTracesCronWorker do ...@@ -60,7 +60,10 @@ describe Ci::ArchiveTracesCronWorker do
end end
it 'puts a log' do it 'puts a log' do
expect(Rails.logger).to receive(:error).with("Failed to archive trace. id: #{build.id} message: Unexpected error") expect(Sidekiq.logger).to receive(:warn).with(
class: described_class.name,
message: "Failed to archive trace. message: Unexpected error.",
job_id: build.id)
subject subject
end end
......
...@@ -7,8 +7,6 @@ describe StuckCiJobsWorker do ...@@ -7,8 +7,6 @@ describe StuckCiJobsWorker do
let!(:runner) { create :ci_runner } let!(:runner) { create :ci_runner }
let!(:job) { create :ci_build, runner: runner } let!(:job) { create :ci_build, runner: runner }
let(:trace_lease_key) { "trace:write:lock:#{job.id}" }
let(:trace_lease_uuid) { SecureRandom.uuid }
let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY } let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_uuid) { SecureRandom.uuid } let(:worker_lease_uuid) { SecureRandom.uuid }
...@@ -16,7 +14,6 @@ describe StuckCiJobsWorker do ...@@ -16,7 +14,6 @@ describe StuckCiJobsWorker do
before do before do
stub_exclusive_lease(worker_lease_key, worker_lease_uuid) stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
stub_exclusive_lease(trace_lease_key, trace_lease_uuid)
job.update!(status: status, updated_at: updated_at) job.update!(status: status, updated_at: updated_at)
end end
...@@ -195,7 +192,6 @@ describe StuckCiJobsWorker do ...@@ -195,7 +192,6 @@ describe StuckCiJobsWorker do
end end
it 'cancels exclusive leases after worker perform' do it 'cancels exclusive leases after worker perform' do
expect_to_cancel_exclusive_lease(trace_lease_key, trace_lease_uuid)
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid) expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
worker.perform worker.perform
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment