Commit 7ced53bf authored by Dmitriy Zaporozhets (DZ)'s avatar Dmitriy Zaporozhets (DZ)

Merge branch 'refactor-stuck-service' into 'master'

Add Ci::StuckBuilds::DropService to hold logic needed for use in StuckCiJobsWorker

See merge request gitlab-org/gitlab!69564
parents 3b0134f8 d68d5d88
# frozen_string_literal: true
module Ci
module StuckBuilds
class DropService
BUILD_RUNNING_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_OUTDATED_TIMEOUT = 1.day
BUILD_SCHEDULED_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_STUCK_TIMEOUT = 1.hour
BUILD_LOOKBACK = 5.days
def execute
Gitlab::AppLogger.info "#{self.class}: Cleaning stuck builds"
drop(running_timed_out_builds, failure_reason: :stuck_or_timeout_failure)
drop(
pending_builds(BUILD_PENDING_OUTDATED_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
drop(scheduled_timed_out_builds, failure_reason: :stale_schedule)
drop_stuck(
pending_builds(BUILD_PENDING_STUCK_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
end
private
# rubocop: disable CodeReuse/ActiveRecord
# We're adding the ordering clause by `created_at` and `project_id`
# because we want to force the query planner to use the
# `ci_builds_gitlab_monitor_metrics` index all the time.
def pending_builds(timeout)
if Feature.enabled?(:ci_new_query_for_pending_stuck_jobs)
Ci::Build.pending.created_at_before(timeout).updated_at_before(timeout).order(created_at: :asc, project_id: :asc)
else
Ci::Build.pending.updated_before(lookback: BUILD_LOOKBACK.ago, timeout: timeout)
end
end
# rubocop: enable CodeReuse/ActiveRecord
def scheduled_timed_out_builds
Ci::Build.where(status: :scheduled).where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.scheduled_at IS NOT NULL AND ci_builds.scheduled_at < ?',
BUILD_SCHEDULED_OUTDATED_TIMEOUT.ago
)
end
def running_timed_out_builds
Ci::Build.running.where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.updated_at < ?',
BUILD_RUNNING_OUTDATED_TIMEOUT.ago
)
end
def drop(builds, failure_reason:)
fetch(builds) do |build|
drop_build :outdated, build, failure_reason
end
end
def drop_stuck(builds, failure_reason:)
fetch(builds) do |build|
break unless build.stuck?
drop_build :stuck, build, failure_reason
end
end
# rubocop: disable CodeReuse/ActiveRecord
def fetch(builds)
loop do
jobs = builds.includes(:tags, :runner, project: [:namespace, :route])
.limit(100)
.to_a
break if jobs.empty?
jobs.each do |job|
Gitlab::ApplicationContext.with_context(project: job.project) { yield(job) }
end
end
end
# rubocop: enable CodeReuse/ActiveRecord
def drop_build(type, build, reason)
Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{build.status}, failure_reason: #{reason})"
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
b.drop(reason)
end
rescue StandardError => ex
build.doom!
track_exception_for_build(ex, build)
end
def track_exception_for_build(ex, build)
Gitlab::ErrorTracking.track_exception(ex,
build_id: build.id,
build_name: build.name,
build_stage: build.stage,
pipeline_id: build.pipeline_id,
project_id: build.project_id
)
end
end
end
end
......@@ -3,72 +3,29 @@
class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
include ApplicationWorker
data_consistency :always
# rubocop:disable Scalability/CronWorkerContext
# This is an instance-wide cleanup query, so there's no meaningful
# scope to consider this in the context of.
include CronjobQueue
# rubocop:enable Scalability/CronWorkerContext
data_consistency :always
feature_category :continuous_integration
worker_resource_boundary :cpu
EXCLUSIVE_LEASE_KEY = 'stuck_ci_builds_worker_lease'
BUILD_RUNNING_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_OUTDATED_TIMEOUT = 1.day
BUILD_SCHEDULED_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_STUCK_TIMEOUT = 1.hour
BUILD_LOOKBACK = 5.days
def perform
return unless try_obtain_lease
Gitlab::AppLogger.info "#{self.class}: Cleaning stuck builds"
drop(running_timed_out_builds, failure_reason: :stuck_or_timeout_failure)
drop(
pending_builds(BUILD_PENDING_OUTDATED_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
drop(scheduled_timed_out_builds, failure_reason: :stale_schedule)
drop_stuck(
pending_builds(BUILD_PENDING_STUCK_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
Ci::StuckBuilds::DropService.new.execute
remove_lease
end
private
# rubocop: disable CodeReuse/ActiveRecord
# We're adding the ordering clause by `created_at` and `project_id`
# because we want to force the query planner to use the
# `ci_builds_gitlab_monitor_metrics` index all the time.
def pending_builds(timeout)
if Feature.enabled?(:ci_new_query_for_pending_stuck_jobs)
Ci::Build.pending.created_at_before(timeout).updated_at_before(timeout).order(created_at: :asc, project_id: :asc)
else
Ci::Build.pending.updated_before(lookback: BUILD_LOOKBACK.ago, timeout: timeout)
end
end
# rubocop: enable CodeReuse/ActiveRecord
def scheduled_timed_out_builds
Ci::Build.where(status: :scheduled).where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.scheduled_at IS NOT NULL AND ci_builds.scheduled_at < ?',
BUILD_SCHEDULED_OUTDATED_TIMEOUT.ago
)
end
def running_timed_out_builds
Ci::Build.running.where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.updated_at < ?',
BUILD_RUNNING_OUTDATED_TIMEOUT.ago
)
end
def try_obtain_lease
@uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain
end
......@@ -76,55 +33,4 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
def remove_lease
Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid)
end
def drop(builds, failure_reason:)
fetch(builds) do |build|
drop_build :outdated, build, failure_reason
end
end
def drop_stuck(builds, failure_reason:)
fetch(builds) do |build|
break unless build.stuck?
drop_build :stuck, build, failure_reason
end
end
# rubocop: disable CodeReuse/ActiveRecord
def fetch(builds)
loop do
jobs = builds.includes(:tags, :runner, project: [:namespace, :route])
.limit(100)
.to_a
break if jobs.empty?
jobs.each do |job|
with_context(project: job.project) { yield(job) }
end
end
end
# rubocop: enable CodeReuse/ActiveRecord
def drop_build(type, build, reason)
Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{build.status}, failure_reason: #{reason})"
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
b.drop(reason)
end
rescue StandardError => ex
build.doom!
track_exception_for_build(ex, build)
end
def track_exception_for_build(ex, build)
Gitlab::ErrorTracking.track_exception(ex,
build_id: build.id,
build_name: build.name,
build_stage: build.stage,
pipeline_id: build.pipeline_id,
project_id: build.project_id
)
end
end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Ci::StuckBuilds::DropService do
let!(:runner) { create :ci_runner }
let!(:job) { create :ci_build, runner: runner }
let(:created_at) { }
let(:updated_at) { }
subject(:service) { described_class.new }
before do
job_attributes = { status: status }
job_attributes[:created_at] = created_at if created_at
job_attributes[:updated_at] = updated_at if updated_at
job.update!(job_attributes)
end
shared_examples 'job is dropped' do
it 'changes status' do
expect(service).to receive(:drop).exactly(3).times.and_call_original
expect(service).to receive(:drop_stuck).exactly(:once).and_call_original
service.execute
job.reload
expect(job).to be_failed
expect(job).to be_stuck_or_timeout_failure
end
context 'when job have data integrity problem' do
it "does drop the job and logs the reason" do
job.update_columns(yaml_variables: '[{"key" => "value"}]')
expect(Gitlab::ErrorTracking).to receive(:track_exception)
.with(anything, a_hash_including(build_id: job.id))
.once
.and_call_original
service.execute
job.reload
expect(job).to be_failed
expect(job).to be_data_integrity_failure
end
end
end
shared_examples 'job is unchanged' do
it 'does not change status' do
expect(service).to receive(:drop).exactly(3).times.and_call_original
expect(service).to receive(:drop_stuck).exactly(:once).and_call_original
service.execute
job.reload
expect(job.status).to eq(status)
end
end
context 'when job is pending' do
let(:status) { 'pending' }
context 'when job is not stuck' do
before do
allow_next_found_instance_of(Ci::Build) do |build|
allow(build).to receive(:stuck?).and_return(false)
end
end
context 'when job was updated_at more than 1 day ago' do
let(:updated_at) { 1.5.days.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated less than 1 day ago' do
let(:updated_at) { 6.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated more than 1 hour ago' do
let(:updated_at) { 2.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 2.hours.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
context 'when job is stuck' do
before do
allow_next_found_instance_of(Ci::Build) do |build|
allow(build).to receive(:stuck?).and_return(true)
end
end
context 'when job was updated_at more than 1 hour ago' do
let(:updated_at) { 1.5.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
end
context 'when job is running' do
let(:status) { 'running' }
context 'when job was updated_at more than an hour ago' do
let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
end
%w(success skipped failed canceled).each do |status|
context "when job is #{status}" do
let(:status) { status }
let(:updated_at) { 2.days.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
context 'for deleted project' do
let(:status) { 'running' }
let(:updated_at) { 2.days.ago }
before do
job.project.update!(pending_delete: true)
end
it_behaves_like 'job is dropped'
end
describe 'drop stale scheduled builds' do
let(:status) { 'scheduled' }
let(:updated_at) { }
context 'when scheduled at 2 hours ago but it is not executed yet' do
let!(:job) { create(:ci_build, :scheduled, scheduled_at: 2.hours.ago) }
it 'drops the stale scheduled build' do
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
service.execute
job.reload
expect(Ci::Build.scheduled.count).to eq(0)
expect(job).to be_failed
expect(job).to be_stale_schedule
end
end
context 'when scheduled at 30 minutes ago but it is not executed yet' do
let!(:job) { create(:ci_build, :scheduled, scheduled_at: 30.minutes.ago) }
it 'does not drop the stale scheduled build yet' do
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
service.execute
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
end
end
context 'when there are no stale scheduled builds' do
it 'does not drop the stale scheduled build yet' do
expect { service.execute }.not_to raise_error
end
end
end
end
......@@ -5,311 +5,50 @@ require 'spec_helper'
RSpec.describe StuckCiJobsWorker do
include ExclusiveLeaseHelpers
let!(:runner) { create :ci_runner }
let!(:job) { create :ci_build, runner: runner }
let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_uuid) { SecureRandom.uuid }
let(:created_at) { }
let(:updated_at) { }
let(:worker2) { described_class.new }
subject(:worker) { described_class.new }
before do
stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
job_attributes = { status: status }
job_attributes[:created_at] = created_at if created_at
job_attributes[:updated_at] = updated_at if updated_at
job.update!(job_attributes)
end
shared_examples 'job is dropped' do
it "changes status" do
worker.perform
job.reload
expect(job).to be_failed
expect(job).to be_stuck_or_timeout_failure
end
context 'when job have data integrity problem' do
it "does drop the job and logs the reason" do
job.update_columns(yaml_variables: '[{"key" => "value"}]')
expect(Gitlab::ErrorTracking).to receive(:track_exception)
.with(anything, a_hash_including(build_id: job.id))
.once
.and_call_original
worker.perform
job.reload
expect(job).to be_failed
expect(job).to be_data_integrity_failure
describe '#perform' do
it 'executes an instance of Ci::StuckBuildsDropService' do
expect_next_instance_of(Ci::StuckBuilds::DropService) do |service|
expect(service).to receive(:execute).exactly(:once)
end
end
end
shared_examples 'job is unchanged' do
before do
worker.perform
job.reload
end
it "doesn't change status" do
expect(job.status).to eq(status)
end
end
context 'when job is pending' do
let(:status) { 'pending' }
context 'when job is not stuck' do
before do
allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(false)
end
context 'when job was updated_at more than 1 day ago' do
let(:updated_at) { 1.5.days.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated less than 1 day ago' do
let(:updated_at) { 6.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated more than 1 hour ago' do
let(:updated_at) { 2.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 2.hours.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
context 'when job is stuck' do
before do
allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(true)
end
context 'when job was updated_at more than 1 hour ago' do
let(:updated_at) { 1.5.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'with an exclusive lease' do
it 'does not execute concurrently' do
expect(worker).to receive(:remove_lease).exactly(:once)
expect(worker2).not_to receive(:remove_lease)
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
end
context 'when job is running' do
let(:status) { 'running' }
context 'when job was updated_at more than an hour ago' do
let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
end
%w(success skipped failed canceled).each do |status|
context "when job is #{status}" do
let(:status) { status }
let(:updated_at) { 2.days.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
worker.perform
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
stub_exclusive_lease_taken(worker_lease_key)
it_behaves_like 'job is unchanged'
worker2.perform
end
end
end
context 'for deleted project' do
let(:status) { 'running' }
let(:updated_at) { 2.days.ago }
before do
job.project.update!(pending_delete: true)
end
it 'does drop job' do
expect_any_instance_of(Ci::Build).to receive(:drop).and_call_original
worker.perform
end
end
describe 'drop stale scheduled builds' do
let(:status) { 'scheduled' }
let(:updated_at) { }
context 'when scheduled at 2 hours ago but it is not executed yet' do
let!(:job) { create(:ci_build, :scheduled, scheduled_at: 2.hours.ago) }
it 'drops the stale scheduled build' do
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
it 'can execute in sequence' do
expect(worker).to receive(:remove_lease).at_least(:once)
expect(worker2).to receive(:remove_lease).at_least(:once)
worker.perform
job.reload
expect(Ci::Build.scheduled.count).to eq(0)
expect(job).to be_failed
expect(job).to be_stale_schedule
worker2.perform
end
end
context 'when scheduled at 30 minutes ago but it is not executed yet' do
let!(:job) { create(:ci_build, :scheduled, scheduled_at: 30.minutes.ago) }
it 'does not drop the stale scheduled build yet' do
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
it 'cancels exclusive leases after worker perform' do
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
worker.perform
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
end
end
context 'when there are no stale scheduled builds' do
it 'does not drop the stale scheduled build yet' do
expect { worker.perform }.not_to raise_error
end
end
end
describe 'exclusive lease' do
let(:status) { 'running' }
let(:updated_at) { 2.days.ago }
let(:worker2) { described_class.new }
it 'is guard by exclusive lease when executed concurrently' do
expect(worker).to receive(:drop).at_least(:once).and_call_original
expect(worker2).not_to receive(:drop)
worker.perform
stub_exclusive_lease_taken(worker_lease_key)
worker2.perform
end
it 'can be executed in sequence' do
expect(worker).to receive(:drop).at_least(:once).and_call_original
expect(worker2).to receive(:drop).at_least(:once).and_call_original
worker.perform
worker2.perform
end
it 'cancels exclusive leases after worker perform' do
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
worker.perform
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment