Commit 34f3d899 authored by Kamil Trzciński's avatar Kamil Trzciński

Merge branch '27523-make-stuck-build-detection-more-performant' into 'master'

Make stuck builds detection more performant

Closes #27523

See merge request !9025
parents b63c41e1 4d4e99a2
class StuckCiBuildsWorker
include Sidekiq::Worker
include CronjobQueue
BUILD_STUCK_TIMEOUT = 1.day
def perform
Rails.logger.info 'Cleaning stuck builds'
builds = Ci::Build.joins(:project).running_or_pending.where('ci_builds.updated_at < ?', BUILD_STUCK_TIMEOUT.ago)
builds.find_each(batch_size: 50).each do |build|
Rails.logger.debug "Dropping stuck #{build.status} build #{build.id} for runner #{build.runner_id}"
build.drop
end
# Update builds that failed to drop
builds.update_all(status: 'failed')
end
end
class StuckCiJobsWorker
include Sidekiq::Worker
include CronjobQueue
EXCLUSIVE_LEASE_KEY = 'stuck_ci_builds_worker_lease'.freeze
BUILD_RUNNING_OUTDATED_TIMEOUT = 1.hour
BUILD_PENDING_OUTDATED_TIMEOUT = 1.day
BUILD_PENDING_STUCK_TIMEOUT = 1.hour
def perform
return unless try_obtain_lease
Rails.logger.info "#{self.class}: Cleaning stuck builds"
drop :running, BUILD_RUNNING_OUTDATED_TIMEOUT
drop :pending, BUILD_PENDING_OUTDATED_TIMEOUT
drop_stuck :pending, BUILD_PENDING_STUCK_TIMEOUT
remove_lease
end
private
def try_obtain_lease
@uuid = Gitlab::ExclusiveLease.new(EXCLUSIVE_LEASE_KEY, timeout: 30.minutes).try_obtain
end
def remove_lease
Gitlab::ExclusiveLease.cancel(EXCLUSIVE_LEASE_KEY, @uuid)
end
def drop(status, timeout)
search(status, timeout) do |build|
drop_build :outdated, build, status, timeout
end
end
def drop_stuck(status, timeout)
search(status, timeout) do |build|
return unless build.stuck?
drop_build :stuck, build, status, timeout
end
end
def search(status, timeout)
builds = Ci::Build.where(status: status).where('ci_builds.updated_at < ?', timeout.ago)
builds.joins(:project).includes(:tags, :runner, project: :namespace).find_each(batch_size: 50).each do |build|
yield(build)
end
end
def drop_build(type, build, status, timeout)
Rails.logger.info "#{self.class}: Dropping #{type} build #{build.id} for runner #{build.runner_id} (status: #{status}, timeout: #{timeout})"
Gitlab::OptimisticLocking.retry_lock(build, 3) do |b|
b.drop
end
end
end
---
title: Make stuck builds detection more performant
merge_request: 9025
author:
......@@ -177,9 +177,9 @@ production: &base
# Periodically executed jobs, to self-heal Gitlab, do external synchronizations, etc.
# Please read here for more information: https://github.com/ondrejbartas/sidekiq-cron#adding-cron-job
cron_jobs:
# Flag stuck CI builds as failed
stuck_ci_builds_worker:
cron: "0 0 * * *"
# Flag stuck CI jobs as failed
stuck_ci_jobs_worker:
cron: "0 * * * *"
# Remove expired build artifacts
expire_build_artifacts_worker:
cron: "50 * * * *"
......
......@@ -308,9 +308,9 @@ Settings.gravatar['host'] = Settings.host_without_www(Settings.gravatar[
# Cron Jobs
#
Settings['cron_jobs'] ||= Settingslogic.new({})
Settings.cron_jobs['stuck_ci_builds_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['stuck_ci_builds_worker']['cron'] ||= '0 0 * * *'
Settings.cron_jobs['stuck_ci_builds_worker']['job_class'] = 'StuckCiBuildsWorker'
Settings.cron_jobs['stuck_ci_jobs_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['stuck_ci_jobs_worker']['cron'] ||= '0 * * * *'
Settings.cron_jobs['stuck_ci_jobs_worker']['job_class'] = 'StuckCiJobsWorker'
Settings.cron_jobs['expire_build_artifacts_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['expire_build_artifacts_worker']['cron'] ||= '50 * * * *'
Settings.cron_jobs['expire_build_artifacts_worker']['job_class'] = 'ExpireBuildArtifactsWorker'
......
require "spec_helper"
describe StuckCiBuildsWorker do
let!(:build) { create :ci_build }
let(:worker) { described_class.new }
subject do
build.reload
build.status
end
%w(pending running).each do |status|
context "#{status} build" do
before do
build.update!(status: status)
end
it 'gets dropped if it was updated over 2 days ago' do
build.update!(updated_at: 2.days.ago)
worker.perform
is_expected.to eq('failed')
end
it "is still #{status}" do
build.update!(updated_at: 1.minute.ago)
worker.perform
is_expected.to eq(status)
end
end
end
%w(success failed canceled).each do |status|
context "#{status} build" do
before do
build.update!(status: status)
end
it "is still #{status}" do
build.update!(updated_at: 2.days.ago)
worker.perform
is_expected.to eq(status)
end
end
end
context "for deleted project" do
before do
build.update!(status: :running, updated_at: 2.days.ago)
build.project.update(pending_delete: true)
end
it "does not drop build" do
expect_any_instance_of(Ci::Build).not_to receive(:drop)
worker.perform
end
end
end
require 'spec_helper'
describe StuckCiJobsWorker do
let!(:runner) { create :ci_runner }
let!(:job) { create :ci_build, runner: runner }
let(:worker) { described_class.new }
let(:exclusive_lease_uuid) { SecureRandom.uuid }
subject do
job.reload
job.status
end
before do
job.update!(status: status, updated_at: updated_at)
allow_any_instance_of(Gitlab::ExclusiveLease).to receive(:try_obtain).and_return(exclusive_lease_uuid)
end
shared_examples 'job is dropped' do
it 'changes status' do
worker.perform
is_expected.to eq('failed')
end
end
shared_examples 'job is unchanged' do
it "doesn't change status" do
worker.perform
is_expected.to eq(status)
end
end
context 'when job is pending' do
let(:status) { 'pending' }
context 'when job is not stuck' do
before { allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(false) }
context 'when job was not updated for more than 1 day ago' do
let(:updated_at) { 2.days.ago }
it_behaves_like 'job is dropped'
end
context 'when job was updated in less than 1 day ago' do
let(:updated_at) { 6.hours.ago }
it_behaves_like 'job is unchanged'
end
context 'when job was not updated for more than 1 hour ago' do
let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is unchanged'
end
end
context 'when job is stuck' do
before { allow_any_instance_of(Ci::Build).to receive(:stuck?).and_return(true) }
context 'when job was not updated for more than 1 hour ago' do
let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
end
end
context 'when job is running' do
let(:status) { 'running' }
context 'when job was not updated for more than 1 hour ago' do
let(:updated_at) { 2.hours.ago }
it_behaves_like 'job is dropped'
end
context 'when job was updated in less than 1 hour ago' do
let(:updated_at) { 30.minutes.ago }
it_behaves_like 'job is unchanged'
end
end
%w(success skipped failed canceled).each do |status|
context "when job is #{status}" do
let(:status) { status }
let(:updated_at) { 2.days.ago }
it_behaves_like 'job is unchanged'
end
end
context 'for deleted project' do
let(:status) { 'running' }
let(:updated_at) { 2.days.ago }
before { job.project.update(pending_delete: true) }
it 'does not drop job' do
expect_any_instance_of(Ci::Build).not_to receive(:drop)
worker.perform
end
end
describe 'exclusive lease' do
let(:status) { 'running' }
let(:updated_at) { 2.days.ago }
let(:worker2) { described_class.new }
it 'is guard by exclusive lease when executed concurrently' do
expect(worker).to receive(:drop).at_least(:once)
expect(worker2).not_to receive(:drop)
worker.perform
allow_any_instance_of(Gitlab::ExclusiveLease).to receive(:try_obtain).and_return(false)
worker2.perform
end
it 'can be executed in sequence' do
expect(worker).to receive(:drop).at_least(:once)
expect(worker2).to receive(:drop).at_least(:once)
worker.perform
worker2.perform
end
it 'cancels exclusive lease after worker perform' do
expect(Gitlab::ExclusiveLease).to receive(:cancel).with(described_class::EXCLUSIVE_LEASE_KEY, exclusive_lease_uuid)
worker.perform
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment