Be more vigorous in killing workers when shutting down sidekiq-cluster

In https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/8511 and https://gitlab.com/gitlab-com/gl-infra/production/issues/1309 we have seen cases where sidekiq-cluster has been asked to restart, but it has left one or more sidekiq worker processes in a stuck/hung state burning CPU. With this, sidekiq-cluster now waits for the worker processes to terminate themselves cleanly (and push jobs back onto the queue) and another few seconds beyond, then if any processes remain, kills them hard (:KILL signal) before exiting as usual (allowing whater process monitor is in place to restart sidekiq-cluster)

Be more vigorous in killing workers when shutting down sidekiq-cluster
In https://gitlab.com/gitlab-com/gl-infra/infrastructure/issues/8511 and https://gitlab.com/gitlab-com/gl-infra/production/issues/1309 we have seen cases where sidekiq-cluster has been asked to restart, but it has left one or more sidekiq worker processes in a stuck/hung state burning CPU. With this, sidekiq-cluster now waits for the worker processes to terminate themselves cleanly (and push jobs back onto the queue) and another few seconds beyond, then if any processes remain, kills them hard (:KILL signal) before exiting as usual (allowing whater process monitor is in place to restart sidekiq-cluster)
9dd56719 · Craig Miskell · Sean McGivern · 16363f28 · 9dd56719 · 9dd56719
Commit 9dd56719 authored Dec 16, 2019 by Craig Miskell Committed by Sean McGivern Jan 09, 2020
4 changed files
--- a/ee/lib/gitlab/sidekiq_cluster.rb
+++ b/ee/lib/gitlab/sidekiq_cluster.rb
@@ -143,6 +143,14 @@ module Gitlab
      true
    end

+    def self.any_alive?(pids)
+      pids_alive(pids).any?
+    end
+
+    def self.pids_alive(pids)
+      pids.select { |pid| signal(pid, 0) }
+    end
+
    def self.write_pid(path)
      File.open(path, 'w') do |handle|
        handle.write(Process.pid.to_s)

--- a/ee/lib/gitlab/sidekiq_cluster/cli.rb
+++ b/ee/lib/gitlab/sidekiq_cluster/cli.rb
@@ -7,6 +7,12 @@ require 'time'
 module Gitlab
  module SidekiqCluster
    class CLI
+      # How often to check for processes when terminating
+      CHECK_TERMINATE_INTERVAL_SECONDS = 1
+      # How long to wait in total when asking for a clean termination
+      # Sidekiq default to self-terminate is 25s
+      TERMINATE_TIMEOUT_SECONDS = 30
+
      CommandError = Class.new(StandardError)

      def initialize(log_output = STDERR)
@@ -63,10 +69,19 @@ module Gitlab
        SidekiqCluster.write_pid(@pid) if @pid
      end

+      def wait_for_termination(check_interval = CHECK_TERMINATE_INTERVAL_SECONDS, terminate_timeout = TERMINATE_TIMEOUT_SECONDS)
+        deadline = Gitlab::Metrics::System.monotonic_time + terminate_timeout
+        sleep(check_interval) while SidekiqCluster.any_alive?(@processes) && Gitlab::Metrics::System.monotonic_time < deadline
+
+        # Hard-stop any that remain; they're likely stuck
+        SidekiqCluster.signal_processes(SidekiqCluster.pids_alive(@processes), :KILL)
+      end
+
      def trap_signals
        SidekiqCluster.trap_terminate do |signal|
          @alive = false
          SidekiqCluster.signal_processes(@processes, signal)
+          wait_for_termination
        end

        SidekiqCluster.trap_forward do |signal|

--- a/ee/spec/lib/gitlab/sidekiq_cluster/cli_spec.rb
+++ b/ee/spec/lib/gitlab/sidekiq_cluster/cli_spec.rb
@@ -84,6 +84,51 @@ describe Gitlab::SidekiqCluster::CLI do
    end
  end

+  describe '#wait_for_termination' do
+    it 'waits for termination of all sub-processes and succeeds after 3 checks' do
+      expect(Gitlab::SidekiqCluster).to receive(:any_alive?)
+        .with(an_instance_of(Array)).and_return(true, true, true, false)
+
+      expect(Gitlab::SidekiqCluster).to receive(:pids_alive)
+        .with([]).and_return([])
+
+      expect(Gitlab::SidekiqCluster).to receive(:signal_processes)
+        .with([], :KILL)
+
+      # Check every 0.1s, for no more than 1 second total
+      cli.wait_for_termination(0.1, 1)
+    end
+
+    context 'with hanging workers' do
+      before do
+        expect(cli).to receive(:write_pid)
+        expect(cli).to receive(:trap_signals)
+        expect(cli).to receive(:start_loop)
+      end
+
+      it 'hard kills workers after timeout expires' do
+        worker_pids = [101, 102, 103]
+        expect(Gitlab::SidekiqCluster).to receive(:start)
+                                            .with([['foo']], default_options)
+                                            .and_return(worker_pids)
+
+        expect(Gitlab::SidekiqCluster).to receive(:any_alive?)
+          .with(worker_pids).and_return(true).at_least(10).times
+
+        expect(Gitlab::SidekiqCluster).to receive(:pids_alive)
+          .with(worker_pids).and_return([102])
+
+        expect(Gitlab::SidekiqCluster).to receive(:signal_processes)
+          .with([102], :KILL)
+
+        cli.run(%w(foo))
+
+        # Check every 0.1s, for no more than 1 second total
+        cli.wait_for_termination(0.1, 1)
+      end
+    end
+  end
+
  describe '#trap_signals' do
    it 'traps the termination and forwarding signals' do
      expect(Gitlab::SidekiqCluster).to receive(:trap_terminate)

--- a/ee/spec/lib/gitlab/sidekiq_cluster_spec.rb
+++ b/ee/spec/lib/gitlab/sidekiq_cluster_spec.rb
@@ -146,6 +146,26 @@ describe Gitlab::SidekiqCluster do
    end
  end

+  describe '.any_alive?' do
+    it 'returns true if at least one process is alive' do
+      processes = [1, 2]
+
+      allow(described_class).to receive(:signal).with(1, 0).and_return(true)
+      allow(described_class).to receive(:signal).with(2, 0).and_return(false)
+
+      expect(described_class.any_alive?(processes)).to eq(true)
+    end
+
+    it 'returns false when all threads are dead' do
+      processes = [1, 2]
+
+      allow(described_class).to receive(:signal).with(1, 0).and_return(false)
+      allow(described_class).to receive(:signal).with(2, 0).and_return(false)
+
+      expect(described_class.any_alive?(processes)).to eq(false)
+    end
+  end
+
  describe '.write_pid' do
    it 'writes the PID of the current process to the given file' do
      handle = double(:handle)