Commit 7ced53bf authored by Dmitriy Zaporozhets (DZ)'s avatar Dmitriy Zaporozhets (DZ)

Merge branch 'refactor-stuck-service' into 'master'

Add Ci::StuckBuilds::DropService to hold logic needed for use in StuckCiJobsWorker

See merge request gitlab-org/gitlab!69564
parents 3b0134f8 d68d5d88
# frozen_string_literal: true
module Ci
module StuckBuilds
class DropService
BUILD_RUNNING_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_OUTDATED_TIMEOUT = 1.day
BUILD_SCHEDULED_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_STUCK_TIMEOUT = 1.hour
BUILD_LOOKBACK = 5.days
def execute
Gitlab::AppLogger.info "#{self.class}: Cleaning stuck builds"
drop(running_timed_out_builds, failure_reason: :stuck_or_timeout_failure)
drop(
pending_builds(BUILD_PENDING_OUTDATED_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
drop(scheduled_timed_out_builds, failure_reason: :stale_schedule)
drop_stuck(
pending_builds(BUILD_PENDING_STUCK_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
end
private
# rubocop: disable CodeReuse/ActiveRecord
# We're adding the ordering clause by `created_at` and `project_id`
# because we want to force the query planner to use the
# `ci_builds_gitlab_monitor_metrics` index all the time.
def pending_builds(timeout)
if Feature.enabled?(:ci_new_query_for_pending_stuck_jobs)
Ci::Build.pending.created_at_before(timeout).updated_at_before(timeout).order(created_at: :asc, project_id: :asc)
else
Ci::Build.pending.updated_before(lookback: BUILD_LOOKBACK.ago, timeout: timeout)
end
end
# rubocop: enable CodeReuse/ActiveRecord
def scheduled_timed_out_builds
Ci::Build.where(status: :scheduled).where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.scheduled_at IS NOT NULL AND ci_builds.scheduled_at < ?',
BUILD_SCHEDULED_OUTDATED_TIMEOUT.ago
)
end
def running_timed_out_builds
Ci::Build.running.where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.updated_at < ?',
BUILD_RUNNING_OUTDATED_TIMEOUT.ago
)
end
def drop(builds, failure_reason:)
fetch(builds) do |build|
drop_build :outdated, build, failure_reason
end
end
def drop_stuck(builds, failure_reason:)
fetch(builds) do |build|
break unless build.stuck?
drop_build :stuck, build, failure_reason
end
end
# rubocop: disable CodeReuse/ActiveRecord
def fetch(builds)
loop do
jobs = builds.includes(:tags, :runner, project: [:namespace, :route])
.limit(100)
.to_a
break if jobs.empty?
jobs.each do |job|
Gitlab::ApplicationContext.with_context(project: job.project) { yield(job) }
end
end
end
# rubocop: enable CodeReuse/ActiveRecord
def drop_build(type, build, reason)
Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{build.status}, failure_reason: #{reason})"
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
b.drop(reason)
end
rescue StandardError => ex
build.doom!
track_exception_for_build(ex, build)
end
def track_exception_for_build(ex, build)
Gitlab::ErrorTracking.track_exception(ex,
build_id: build.id,
build_name: build.name,
build_stage: build.stage,
pipeline_id: build.pipeline_id,
project_id: build.project_id
)
end
end
end
end
...@@ -3,72 +3,29 @@ ...@@ -3,72 +3,29 @@
class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
include ApplicationWorker include ApplicationWorker
data_consistency :always # rubocop:disable Scalability/CronWorkerContext
# This is an instance-wide cleanup query, so there's no meaningful
# scope to consider this in the context of.
include CronjobQueue include CronjobQueue
# rubocop:enable Scalability/CronWorkerContext
data_consistency :always
feature_category :continuous_integration feature_category :continuous_integration
worker_resource_boundary :cpu worker_resource_boundary :cpu
EXCLUSIVE_LEASE_KEY = 'stuck_ci_builds_worker_lease' EXCLUSIVE_LEASE_KEY = 'stuck_ci_builds_worker_lease'
BUILD_RUNNING_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_OUTDATED_TIMEOUT = 1.day
BUILD_SCHEDULED_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_STUCK_TIMEOUT = 1.hour
BUILD_LOOKBACK = 5.days
def perform def perform
return unless try_obtain_lease return unless try_obtain_lease
Gitlab::AppLogger.info "#{self.class}: Cleaning stuck builds" Ci::StuckBuilds::DropService.new.execute
drop(running_timed_out_builds, failure_reason: :stuck_or_timeout_failure)
drop(
pending_builds(BUILD_PENDING_OUTDATED_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
drop(scheduled_timed_out_builds, failure_reason: :stale_schedule)
drop_stuck(
pending_builds(BUILD_PENDING_STUCK_TIMEOUT.ago),
failure_reason: :stuck_or_timeout_failure
)
remove_lease remove_lease
end end
private private
# rubocop: disable CodeReuse/ActiveRecord
# We're adding the ordering clause by `created_at` and `project_id`
# because we want to force the query planner to use the
# `ci_builds_gitlab_monitor_metrics` index all the time.
def pending_builds(timeout)
if Feature.enabled?(:ci_new_query_for_pending_stuck_jobs)
Ci::Build.pending.created_at_before(timeout).updated_at_before(timeout).order(created_at: :asc, project_id: :asc)
else
Ci::Build.pending.updated_before(lookback: BUILD_LOOKBACK.ago, timeout: timeout)
end
end
# rubocop: enable CodeReuse/ActiveRecord
def scheduled_timed_out_builds
Ci::Build.where(status: :scheduled).where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.scheduled_at IS NOT NULL AND ci_builds.scheduled_at < ?',
BUILD_SCHEDULED_OUTDATED_TIMEOUT.ago
)
end
def running_timed_out_builds
Ci::Build.running.where( # rubocop: disable CodeReuse/ActiveRecord
'ci_builds.updated_at < ?',
BUILD_RUNNING_OUTDATED_TIMEOUT.ago
)
end
def try_obtain_lease def try_obtain_lease
@uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain @uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain
end end
...@@ -76,55 +33,4 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker ...@@ -76,55 +33,4 @@ class StuckCiJobsWorker # rubocop:disable Scalability/IdempotentWorker
def remove_lease def remove_lease
Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid) Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid)
end end
def drop(builds, failure_reason:)
fetch(builds) do |build|
drop_build :outdated, build, failure_reason
end
end
def drop_stuck(builds, failure_reason:)
fetch(builds) do |build|
break unless build.stuck?
drop_build :stuck, build, failure_reason
end
end
# rubocop: disable CodeReuse/ActiveRecord
def fetch(builds)
loop do
jobs = builds.includes(:tags, :runner, project: [:namespace, :route])
.limit(100)
.to_a
break if jobs.empty?
jobs.each do |job|
with_context(project: job.project) { yield(job) }
end
end
end
# rubocop: enable CodeReuse/ActiveRecord
def drop_build(type, build, reason)
Gitlab::AppLogger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{build.status}, failure_reason: #{reason})"
Gitlab::OptimisticLocking.retry_lock(build, 3, name: 'stuck_ci_jobs_worker_drop_build') do |b|
b.drop(reason)
end
rescue StandardError => ex
build.doom!
track_exception_for_build(ex, build)
end
def track_exception_for_build(ex, build)
Gitlab::ErrorTracking.track_exception(ex,
build_id: build.id,
build_name: build.name,
build_stage: build.stage,
pipeline_id: build.pipeline_id,
project_id: build.project_id
)
end
end end
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Ci::StuckBuilds::DropService do
let!(:runner) { create :ci_runner }
let!(:job) { create :ci_build, runner: runner }
let(:created_at) { }
let(:updated_at) { }
subject(:service) { described_class.new }
before do
job_attributes = { status: status }
job_attributes[:created_at] = created_at if created_at
job_attributes[:updated_at] = updated_at if updated_at
job.update!(job_attributes)
end
shared_examples 'job is dropped' do
it 'changes status' do
expect(service).to receive(:drop).exactly(3).times.and_call_original
expect(service).to receive(:drop_stuck).exactly(:once).and_call_original
service.execute
job.reload
expect(job).to be_failed
expect(job).to be_stuck_or_timeout_failure
end
context 'when job have data integrity problem' do
it "does drop the job and logs the reason" do
job.update_columns(yaml_variables: '[{"key" => "value"}]')
expect(Gitlab::ErrorTracking).to receive(:track_exception)
.with(anything, a_hash_including(build_id: job.id))
.once
.and_call_original
service.execute
job.reload
expect(job).to be_failed
expect(job).to be_data_integrity_failure
end
end
end
shared_examples 'job is unchanged' do
it 'does not change status' do
expect(service).to receive(:drop).exactly(3).times.and_call_original
expect(service).to receive(:drop_stuck).exactly(:once).and_call_original
service.execute
job.reload
expect(job.status).to eq(status)
end
end
context 'when job is pending' do
let(:status) { 'pending' }
context 'when job is not stuck' do
before do
allow_next_found_instance_of(Ci::Build) do |build|
allow(build).to receive(:stuck?).and_return(false)
end
end
context 'when job was updated_at more than 1 day ago' do
let(:updated_at) { 1.5.days.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated less than 1 day ago' do
let(:updated_at) { 6.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated more than 1 hour ago' do
let(:updated_at) { 2.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 2.hours.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
context 'when job is stuck' do
before do
allow_next_found_instance_of(Ci::Build) do |build|
allow(build).to receive(:stuck?).and_return(true)
end
end
context 'when job was updated_at more than 1 hour ago' do
let(:updated_at) { 1.5.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
end
context 'when job is running' do
let(:status) { 'running' }
context 'when job was updated_at more than an hour ago' do
let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
end
%w(success skipped failed canceled).each do |status|
context "when job is #{status}" do
let(:status) { status }
let(:updated_at) { 2.days.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
context 'for deleted project' do
let(:status) { 'running' }
let(:updated_at) { 2.days.ago }
before do
job.project.update!(pending_delete: true)
end
it_behaves_like 'job is dropped'
end
describe 'drop stale scheduled builds' do
let(:status) { 'scheduled' }
let(:updated_at) { }
context 'when scheduled at 2 hours ago but it is not executed yet' do
let!(:job) { create(:ci_build, :scheduled, scheduled_at: 2.hours.ago) }
it 'drops the stale scheduled build' do
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
service.execute
job.reload
expect(Ci::Build.scheduled.count).to eq(0)
expect(job).to be_failed
expect(job).to be_stale_schedule
end
end
context 'when scheduled at 30 minutes ago but it is not executed yet' do
let!(:job) { create(:ci_build, :scheduled, scheduled_at: 30.minutes.ago) }
it 'does not drop the stale scheduled build yet' do
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
service.execute
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
end
end
context 'when there are no stale scheduled builds' do
it 'does not drop the stale scheduled build yet' do
expect { service.execute }.not_to raise_error
end
end
end
end
...@@ -5,311 +5,50 @@ require 'spec_helper' ...@@ -5,311 +5,50 @@ require 'spec_helper'
RSpec.describe StuckCiJobsWorker do RSpec.describe StuckCiJobsWorker do
include ExclusiveLeaseHelpers include ExclusiveLeaseHelpers
let!(:runner) { create :ci_runner } let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
let!(:job) { create :ci_build, runner: runner }
let(:worker_lease_key) { StuckCiJobsWorker::EXCLUSIVE_LEASE_KEY }
let(:worker_lease_uuid) { SecureRandom.uuid } let(:worker_lease_uuid) { SecureRandom.uuid }
let(:created_at) { } let(:worker2) { described_class.new }
let(:updated_at) { }
subject(:worker) { described_class.new } subject(:worker) { described_class.new }
before do before do
stub_exclusive_lease(worker_lease_key, worker_lease_uuid) stub_exclusive_lease(worker_lease_key, worker_lease_uuid)
job_attributes = { status: status }
job_attributes[:created_at] = created_at if created_at
job_attributes[:updated_at] = updated_at if updated_at
job.update!(job_attributes)
end end
shared_examples 'job is dropped' do describe '#perform' do
it "changes status" do it 'executes an instance of Ci::StuckBuildsDropService' do
worker.perform expect_next_instance_of(Ci::StuckBuilds::DropService) do |service|
job.reload expect(service).to receive(:execute).exactly(:once)
expect(job).to be_failed
expect(job).to be_stuck_or_timeout_failure
end
context 'when job have data integrity problem' do
it "does drop the job and logs the reason" do
job.update_columns(yaml_variables: '[{"key" => "value"}]')
expect(Gitlab::ErrorTracking).to receive(:track_exception)
.with(anything, a_hash_including(build_id: job.id))
.once
.and_call_original
worker.perform
job.reload
expect(job).to be_failed
expect(job).to be_data_integrity_failure
end end
end
end
shared_examples 'job is unchanged' do
before do
worker.perform worker.perform
job.reload
end end
it "doesn't change status" do context 'with an exclusive lease' do
expect(job.status).to eq(status) it 'does not execute concurrently' do
end expect(worker).to receive(:remove_lease).exactly(:once)
end expect(worker2).not_to receive(:remove_lease)
context 'when job is pending' do
let(:status) { 'pending' }
context 'when job is not stuck' do
before do
allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(false)
end
context 'when job was updated_at more than 1 day ago' do
let(:updated_at) { 1.5.days.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated less than 1 day ago' do
let(:updated_at) { 6.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated more than 1 hour ago' do
let(:updated_at) { 2.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 2.hours.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
context 'when job is stuck' do
before do
allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(true)
end
context 'when job was updated_at more than 1 hour ago' do
let(:updated_at) { 1.5.hours.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 1.5.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is dropped'
end
context 'when created_at is outside lookback window' do
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do worker.perform
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged'
end
end
end
end
context 'when job is running' do
let(:status) { 'running' }
context 'when job was updated_at more than an hour ago' do
let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
end
%w(success skipped failed canceled).each do |status|
context "when job is #{status}" do
let(:status) { status }
let(:updated_at) { 2.days.ago }
context 'when created_at is the same as updated_at' do
let(:created_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is before updated_at' do
let(:created_at) { 3.days.ago }
it_behaves_like 'job is unchanged'
end
context 'when created_at is outside lookback window' do stub_exclusive_lease_taken(worker_lease_key)
let(:created_at) { described_class::BUILD_LOOKBACK - 1.day }
it_behaves_like 'job is unchanged' worker2.perform
end end
end
end
context 'for deleted project' do
let(:status) { 'running' }
let(:updated_at) { 2.days.ago }
before do
job.project.update!(pending_delete: true)
end
it 'does drop job' do
expect_any_instance_of(Ci::Build).to receive(:drop).and_call_original
worker.perform
end
end
describe 'drop stale scheduled builds' do
let(:status) { 'scheduled' }
let(:updated_at) { }
context 'when scheduled at 2 hours ago but it is not executed yet' do
let!(:job) { create(:ci_build, :scheduled, scheduled_at: 2.hours.ago) }
it 'drops the stale scheduled build' do it 'can execute in sequence' do
expect(Ci::Build.scheduled.count).to eq(1) expect(worker).to receive(:remove_lease).at_least(:once)
expect(job).to be_scheduled expect(worker2).to receive(:remove_lease).at_least(:once)
worker.perform worker.perform
job.reload worker2.perform
expect(Ci::Build.scheduled.count).to eq(0)
expect(job).to be_failed
expect(job).to be_stale_schedule
end end
end
context 'when scheduled at 30 minutes ago but it is not executed yet' do
let!(:job) { create(:ci_build, :scheduled, scheduled_at: 30.minutes.ago) }
it 'does not drop the stale scheduled build yet' do it 'cancels exclusive leases after worker perform' do
expect(Ci::Build.scheduled.count).to eq(1) expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
expect(job).to be_scheduled
worker.perform worker.perform
expect(Ci::Build.scheduled.count).to eq(1)
expect(job).to be_scheduled
end
end
context 'when there are no stale scheduled builds' do
it 'does not drop the stale scheduled build yet' do
expect { worker.perform }.not_to raise_error
end end
end end
end end
describe 'exclusive lease' do
let(:status) { 'running' }
let(:updated_at) { 2.days.ago }
let(:worker2) { described_class.new }
it 'is guard by exclusive lease when executed concurrently' do
expect(worker).to receive(:drop).at_least(:once).and_call_original
expect(worker2).not_to receive(:drop)
worker.perform
stub_exclusive_lease_taken(worker_lease_key)
worker2.perform
end
it 'can be executed in sequence' do
expect(worker).to receive(:drop).at_least(:once).and_call_original
expect(worker2).to receive(:drop).at_least(:once).and_call_original
worker.perform
worker2.perform
end
it 'cancels exclusive leases after worker perform' do
expect_to_cancel_exclusive_lease(worker_lease_key, worker_lease_uuid)
worker.perform
end
end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment