Commit 1311393d authored by Sean McGivern's avatar Sean McGivern

Merge branch 'sidekiq-cluster-terminate-hung-workers' into 'master'

Be more vigorous killing workers when shutting down sidekiq-cluster

See merge request gitlab-org/gitlab!21796
parents 58938101 7dc1959f
---
title: When sidekiq-cluster is asked to shutdown, actively terminate any sidekiq processes that don't finish cleanly in short order
merge_request: 21796
author:
type: fixed
......@@ -137,12 +137,26 @@ module Gitlab
# Returns true if all the processes are alive.
def self.all_alive?(pids)
pids.each do |pid|
return false unless signal(pid, 0)
return false unless process_alive?(pid)
end
true
end
def self.any_alive?(pids)
pids_alive(pids).any?
end
def self.pids_alive(pids)
pids.select { |pid| process_alive?(pid) }
end
def self.process_alive?(pid)
# Signal 0 tests whether the process exists and we have access to send signals
# but is otherwise a noop (doesn't actually send a signal to the process)
signal(pid, 0)
end
def self.write_pid(path)
File.open(path, 'w') do |handle|
handle.write(Process.pid.to_s)
......
......@@ -7,6 +7,11 @@ require 'time'
module Gitlab
module SidekiqCluster
class CLI
CHECK_TERMINATE_INTERVAL_SECONDS = 1
# How long to wait in total when asking for a clean termination
# Sidekiq default to self-terminate is 25s
TERMINATE_TIMEOUT_SECONDS = 30
CommandError = Class.new(StandardError)
def initialize(log_output = STDERR)
......@@ -63,10 +68,30 @@ module Gitlab
SidekiqCluster.write_pid(@pid) if @pid
end
def monotonic_time
Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_second)
end
def continue_waiting?(deadline)
SidekiqCluster.any_alive?(@processes) && monotonic_time < deadline
end
def hard_stop_stuck_pids
SidekiqCluster.signal_processes(SidekiqCluster.pids_alive(@processes), :KILL)
end
def wait_for_termination
deadline = monotonic_time + TERMINATE_TIMEOUT_SECONDS
sleep(CHECK_TERMINATE_INTERVAL_SECONDS) while continue_waiting?(deadline)
hard_stop_stuck_pids
end
def trap_signals
SidekiqCluster.trap_terminate do |signal|
@alive = false
SidekiqCluster.signal_processes(@processes, signal)
wait_for_termination
end
SidekiqCluster.trap_forward do |signal|
......
......@@ -84,6 +84,53 @@ describe Gitlab::SidekiqCluster::CLI do
end
end
describe '#wait_for_termination' do
it 'waits for termination of all sub-processes and succeeds after 3 checks' do
expect(Gitlab::SidekiqCluster).to receive(:any_alive?)
.with(an_instance_of(Array)).and_return(true, true, true, false)
expect(Gitlab::SidekiqCluster).to receive(:pids_alive)
.with([]).and_return([])
expect(Gitlab::SidekiqCluster).to receive(:signal_processes)
.with([], :KILL)
stub_const("Gitlab::SidekiqCluster::CLI::CHECK_TERMINATE_INTERVAL_SECONDS", 0.1)
stub_const("Gitlab::SidekiqCluster::CLI::TERMINATE_TIMEOUT_SECONDS", 1)
cli.wait_for_termination
end
context 'with hanging workers' do
before do
expect(cli).to receive(:write_pid)
expect(cli).to receive(:trap_signals)
expect(cli).to receive(:start_loop)
end
it 'hard kills workers after timeout expires' do
worker_pids = [101, 102, 103]
expect(Gitlab::SidekiqCluster).to receive(:start)
.with([['foo']], default_options)
.and_return(worker_pids)
expect(Gitlab::SidekiqCluster).to receive(:any_alive?)
.with(worker_pids).and_return(true).at_least(10).times
expect(Gitlab::SidekiqCluster).to receive(:pids_alive)
.with(worker_pids).and_return([102])
expect(Gitlab::SidekiqCluster).to receive(:signal_processes)
.with([102], :KILL)
cli.run(%w(foo))
stub_const("Gitlab::SidekiqCluster::CLI::CHECK_TERMINATE_INTERVAL_SECONDS", 0.1)
stub_const("Gitlab::SidekiqCluster::CLI::TERMINATE_TIMEOUT_SECONDS", 1)
cli.wait_for_termination
end
end
end
describe '#trap_signals' do
it 'traps the termination and forwarding signals' do
expect(Gitlab::SidekiqCluster).to receive(:trap_terminate)
......
......@@ -128,24 +128,38 @@ describe Gitlab::SidekiqCluster do
end
end
# In the X_alive? checks, we check negative PIDs sometimes as a simple way
# to be sure the pids are definitely for non-existent processes.
# Note that -1 is special, and sends the signal to every process we have permission
# for, so we use -2, -3 etc
describe '.all_alive?' do
it 'returns true if all processes are alive' do
processes = [1]
allow(described_class).to receive(:signal).with(1, 0).and_return(true)
processes = [Process.pid]
expect(described_class.all_alive?(processes)).to eq(true)
end
it 'returns false when a thread was not alive' do
processes = [1]
allow(described_class).to receive(:signal).with(1, 0).and_return(false)
processes = [-2]
expect(described_class.all_alive?(processes)).to eq(false)
end
end
describe '.any_alive?' do
it 'returns true if at least one process is alive' do
processes = [Process.pid, -2]
expect(described_class.any_alive?(processes)).to eq(true)
end
it 'returns false when all threads are dead' do
processes = [-2, -3]
expect(described_class.any_alive?(processes)).to eq(false)
end
end
describe '.write_pid' do
it 'writes the PID of the current process to the given file' do
handle = double(:handle)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment