Commit d9ff351f authored by Bob Van Landuyt

Merge branch '350548-puma-metrics-server-process' into 'master'

Use external metrics server process for Puma

See merge request gitlab-org/gitlab!82478
parents 5e7cba8a 31a3c142
# frozen_string_literal: true
PUMA_EXTERNAL_METRICS_SERVER = Gitlab::Utils.to_boolean(ENV['PUMA_EXTERNAL_METRICS_SERVER'])
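
As a brief aside (illustrative, not part of the diff): the environment variable is coerced once at boot, so flipping it requires a Puma restart, and leaving it unset keeps the constant falsy, which preserves the in-process exporter as the default. The sample values below assume Gitlab::Utils.to_boolean accepts the usual truthy/falsy strings.

# Illustration only — assumed behaviour of the boot-time toggle:
#   ENV unset                                      -> constant is nil (falsy): keep the in-process WebExporter
#   ENV['PUMA_EXTERNAL_METRICS_SERVER'] = 'true'   -> constant is true: spawn the external metrics server
#   ENV['PUMA_EXTERNAL_METRICS_SERVER'] = 'false'  -> constant is false: keep the in-process WebExporter
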
# Keep separate directories for separate processes
def prometheus_default_multiproc_dir
return unless Rails.env.development? || Rails.env.test?
@@ -72,9 +74,13 @@ Gitlab::Cluster::LifecycleEvents.on_master_start do
if Gitlab::Runtime.puma?
Gitlab::Metrics::Samplers::PumaSampler.instance.start
# Starts a metrics server to export metrics from the Puma primary.
if Settings.monitoring.web_exporter.enabled && PUMA_EXTERNAL_METRICS_SERVER
require_relative '../../metrics_server/metrics_server'
MetricsServer.start_for_puma
else
Gitlab::Metrics::Exporter::WebExporter.instance.start
end
end
Gitlab::Ci::Parsers.instrument!
rescue IOError => e
@@ -90,11 +96,13 @@ Gitlab::Cluster::LifecycleEvents.on_worker_start do
Gitlab::Metrics::Samplers::ThreadsSampler.initialize_instance(logger: logger).start
if Gitlab::Runtime.puma?
# Since we are running a metrics server on the Puma primary, we would inherit
# this thread after forking into workers, so we need to explicitly stop it here.
# NOTE: This will not be necessary anymore after moving to an external server
# process via https://gitlab.com/gitlab-org/gitlab/-/issues/350548
# Since we are observing a metrics server from the Puma primary, we would inherit
# this supervision thread after forking into workers, so we need to explicitly stop it here.
if PUMA_EXTERNAL_METRICS_SERVER
::MetricsServer::PumaProcessSupervisor.instance.stop
else
Gitlab::Metrics::Exporter::WebExporter.instance.stop
end
Gitlab::Metrics::Samplers::ActionCableSampler.instance(logger: logger).start
end
@@ -112,16 +120,24 @@ end
if Gitlab::Runtime.puma?
Gitlab::Cluster::LifecycleEvents.on_before_graceful_shutdown do
# We need to ensure that before we re-exec or shutdown server
# we do stop the exporter
# we also stop the metrics server
if PUMA_EXTERNAL_METRICS_SERVER
::MetricsServer::PumaProcessSupervisor.instance.shutdown
else
Gitlab::Metrics::Exporter::WebExporter.instance.stop
end
end
Gitlab::Cluster::LifecycleEvents.on_before_master_restart do
# We need to ensure that before we re-exec server
# we do stop the exporter
# we also stop the metrics server
#
# We do it again, for being extra safe,
# but it should not be needed
if PUMA_EXTERNAL_METRICS_SERVER
::MetricsServer::PumaProcessSupervisor.instance.shutdown
else
Gitlab::Metrics::Exporter::WebExporter.instance.stop
end
end
end
@@ -9,7 +9,7 @@ module Gitlab
# The supervisor will also trap termination signals if provided and
# propagate those to the supervised processes. Any supervised processes
# that do not terminate within a specified grace period will be killed.
class ProcessSupervisor
class ProcessSupervisor < Gitlab::Daemon
DEFAULT_HEALTH_CHECK_INTERVAL_SECONDS = 5
DEFAULT_TERMINATE_INTERVAL_SECONDS = 1
DEFAULT_TERMINATE_TIMEOUT_SECONDS = 10
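
For orientation, a minimal usage sketch of the supervisor interface this file exposes, assuming the API shown in this diff (supervise, shutdown, and the restart callback); worker_pid and respawn_worker are hypothetical:

# Minimal sketch with made-up helpers; not taken from the change itself.
supervisor = Gitlab::ProcessSupervisor.instance(
  health_check_interval_seconds: 5,
  term_signals: %i(INT TERM)
)

supervisor.supervise(worker_pid) do |dead_pids|
  # Return the PIDs of any replacement processes; they are added to the supervised set.
  dead_pids.map { respawn_worker }
end

supervisor.shutdown # later, e.g. from a shutdown hook: TERM, then KILL after the grace period
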
@@ -21,13 +21,18 @@ module Gitlab
check_terminate_interval_seconds: DEFAULT_TERMINATE_INTERVAL_SECONDS,
terminate_timeout_seconds: DEFAULT_TERMINATE_TIMEOUT_SECONDS,
term_signals: %i(INT TERM),
forwarded_signals: [])
forwarded_signals: [],
**options)
super(**options)
@term_signals = term_signals
@forwarded_signals = forwarded_signals
@health_check_interval_seconds = health_check_interval_seconds
@check_terminate_interval_seconds = check_terminate_interval_seconds
@terminate_timeout_seconds = terminate_timeout_seconds
@pids = []
@alive = false
end
# Starts a supervision loop for the given process ID(s).
@@ -39,34 +44,68 @@ module Gitlab
# start observing those processes instead. Otherwise it will shut down.
def supervise(pid_or_pids, &on_process_death)
@pids = Array(pid_or_pids)
@on_process_death = on_process_death
trap_signals!
start
end
# Shuts down the supervisor and all supervised processes with the given signal.
def shutdown(signal = :TERM)
return unless @alive
stop_processes(signal)
stop
end
def supervised_pids
@pids
end
private
def start_working
@alive = true
end
def stop_working
@alive = false
end
def run_thread
while @alive
sleep(@health_check_interval_seconds)
check_process_health(&on_process_death)
check_process_health
end
end
private
def check_process_health(&on_process_death)
def check_process_health
unless all_alive?
dead_pids = @pids - live_pids
@pids = Array(yield(dead_pids))
existing_pids = live_pids # Capture this value for the duration of the block.
dead_pids = @pids - existing_pids
new_pids = Array(@on_process_death.call(dead_pids))
@pids = existing_pids + new_pids
@alive = @pids.any?
end
end
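
To make the bookkeeping above concrete, one hypothetical health-check tick (PIDs invented for illustration):

# Hypothetical tick: @pids == [101, 102] and process 101 has died.
#   live_pids                      # => [102]
#   dead_pids                      # => [101]
#   @on_process_death.call([101])  # the caller may start a replacement, returning e.g. [103]
#   @pids                          # => [102, 103]
#   @alive                         # => true; it only becomes false once no PIDs remain
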
def trap_signals!
ProcessManagement.trap_signals(@term_signals) do |signal|
def stop_processes(signal)
# Set this prior to shutting down so that shutdown hooks which read `alive`
# know the supervisor is about to shut down.
@alive = false
# Shut down supervised processes.
signal_all(signal)
wait_for_termination
end
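
In other words, stop_processes runs the same escalation the trapped termination signals use; with the default constants from this file it amounts to:

# 1. @alive = false        — shutdown hooks that read `alive` see the supervisor winding down
# 2. signal_all(signal)    — e.g. TERM to every supervised PID
# 3. wait_for_termination  — poll liveness every DEFAULT_TERMINATE_INTERVAL_SECONDS (1s) and
#                            escalate to KILL once DEFAULT_TERMINATE_TIMEOUT_SECONDS (10s) elapse
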
def trap_signals!
ProcessManagement.trap_signals(@term_signals) do |signal|
stop_processes(signal)
end
ProcessManagement.trap_signals(@forwarded_signals) do |signal|
signal_all(signal)
end
......
@@ -31,5 +31,6 @@ require_relative '../lib/gitlab/metrics/exporter/gc_request_middleware'
require_relative '../lib/gitlab/health_checks/probes/collection'
require_relative '../lib/gitlab/health_checks/probes/status'
require_relative '../lib/gitlab/process_management'
require_relative '../lib/gitlab/process_supervisor'
# rubocop:enable Naming/FileName
@@ -5,7 +5,28 @@ require_relative '../config/boot'
require_relative 'dependencies'
class MetricsServer # rubocop:disable Gitlab/NamespacedClass
# The singleton instance used to supervise the Puma metrics server.
PumaProcessSupervisor = Class.new(Gitlab::ProcessSupervisor)
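
A short note on the Class.new(Gitlab::ProcessSupervisor) pattern, assuming Gitlab::Daemon memoizes one instance per class: giving each consumer its own subclass keeps the Puma and Sidekiq singleton supervisors separate.

# Assumed behaviour: each subclass memoizes its own singleton, so these never share state.
PumaProcessSupervisor.instance                             # the Puma metrics-server supervisor
Gitlab::SidekiqCluster::SidekiqProcessSupervisor.instance  # a distinct supervisor for sidekiq-cluster
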
class << self
def start_for_puma
metrics_dir = ::Prometheus::Client.configuration.multiprocess_files_dir
start_server = proc do
MetricsServer.spawn('puma', metrics_dir: metrics_dir).tap do |pid|
Gitlab::AppLogger.info("Starting Puma metrics server with pid #{pid}")
end
end
supervisor = PumaProcessSupervisor.instance
supervisor.supervise(start_server.call) do
next unless supervisor.alive
Gitlab::AppLogger.info('Puma metrics server terminated, restarting...')
start_server.call
end
end
def spawn(target, metrics_dir:, gitlab_config: nil, wipe_metrics_dir: false)
ensure_valid_target!(target)
......
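
The body of spawn is elided above. Going only by the expectations in the MetricsServer spec at the bottom of this page (a METRICS_SERVER_TARGET env entry, a bin/metrics-server entry point, a returned PID), a rough sketch of its likely shape; how the keyword arguments are forwarded is an assumption, not something this diff shows:

# Rough sketch inferred from the spec below; not the actual implementation.
def self.spawn(target, metrics_dir:, gitlab_config: nil, wipe_metrics_dir: false)
  ensure_valid_target!(target)

  env = { 'METRICS_SERVER_TARGET' => target } # asserted by the spec ('puma' or 'sidekiq')
  # Forwarding metrics_dir, gitlab_config and wipe_metrics_dir (e.g. via further
  # env vars) is assumed here; the diff does not show it.

  Process.spawn(env, 'bin/metrics-server') # returns the child PID, which callers supervise
end
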
@@ -10,6 +10,7 @@ require 'time'
# we may run into "already initialized" warnings, hence the check.
require_relative '../lib/gitlab' unless Object.const_defined?('Gitlab')
require_relative '../lib/gitlab/utils'
require_relative '../lib/gitlab/daemon'
require_relative '../lib/gitlab/sidekiq_config/cli_methods'
require_relative '../lib/gitlab/sidekiq_config/worker_matcher'
require_relative '../lib/gitlab/sidekiq_logging/json_formatter'
@@ -121,11 +122,12 @@ module Gitlab
ProcessManagement.write_pid(@pid) if @pid
supervisor = Gitlab::ProcessSupervisor.new(
supervisor = SidekiqProcessSupervisor.instance(
health_check_interval_seconds: @interval,
terminate_timeout_seconds: @soft_timeout_seconds + TIMEOUT_GRACE_PERIOD_SECONDS,
term_signals: TERMINATE_SIGNALS,
forwarded_signals: FORWARD_SIGNALS
forwarded_signals: FORWARD_SIGNALS,
synchronous: true
)
metrics_server_pid = start_metrics_server
@@ -136,7 +138,7 @@ module Gitlab
# If we're not in the process of shutting down the cluster,
# and the metrics server died, restart it.
if supervisor.alive && dead_pids.include?(metrics_server_pid)
@logger.info('Metrics server terminated, restarting...')
@logger.info('Sidekiq metrics server terminated, restarting...')
metrics_server_pid = restart_metrics_server(wipe_metrics_dir: false)
all_pids = worker_pids + Array(metrics_server_pid)
else
......
@@ -16,6 +16,9 @@ module Gitlab
# before we kill the process.
TIMEOUT_GRACE_PERIOD_SECONDS = 5
# The singleton instance used to supervise cluster processes.
SidekiqProcessSupervisor = Class.new(Gitlab::ProcessSupervisor)
# Starts Sidekiq workers for the pairs of processes.
#
# Example:
......
@@ -40,6 +40,8 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
}
end
let(:supervisor) { instance_double(Gitlab::SidekiqCluster::SidekiqProcessSupervisor) }
before do
stub_env('RAILS_ENV', 'test')
@@ -47,8 +49,11 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
config_file.close
allow(::Settings).to receive(:source).and_return(config_file.path)
::Settings.reload!
allow(Gitlab::ProcessManagement).to receive(:write_pid)
allow(Gitlab::SidekiqCluster::SidekiqProcessSupervisor).to receive(:instance).and_return(supervisor)
allow(supervisor).to receive(:supervise)
end
after do
@@ -63,11 +68,6 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
end
context 'with arguments' do
before do
allow(Gitlab::ProcessManagement).to receive(:write_pid)
allow_next_instance_of(Gitlab::ProcessSupervisor) { |it| allow(it).to receive(:supervise) }
end
it 'starts the Sidekiq workers' do
expect(Gitlab::SidekiqCluster).to receive(:start)
.with([['foo']], default_options)
@@ -298,8 +298,6 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
context 'without --dryrun' do
before do
allow(Gitlab::SidekiqCluster).to receive(:start).and_return([])
allow(Gitlab::ProcessManagement).to receive(:write_pid)
allow_next_instance_of(Gitlab::ProcessSupervisor) { |it| allow(it).to receive(:supervise) }
end
context 'when there are no sidekiq_health_checks settings set' do
@@ -429,14 +427,11 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
before do
allow(Gitlab::SidekiqCluster).to receive(:start).and_return(sidekiq_worker_pids)
allow(Gitlab::ProcessManagement).to receive(:write_pid)
end
it 'stops the entire process cluster if one of the workers has been terminated' do
allow_next_instance_of(Gitlab::ProcessSupervisor) do |it|
allow(it).to receive(:supervise).and_yield([2])
end
expect(supervisor).to receive(:alive).and_return(true)
expect(supervisor).to receive(:supervise).and_yield([2])
expect(MetricsServer).to receive(:fork).once.and_return(metrics_server_pid)
expect(Gitlab::ProcessManagement).to receive(:signal_processes).with([42, 99], :TERM)
@@ -445,11 +440,8 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
context 'when the supervisor is alive' do
it 'restarts the metrics server when it is down' do
allow_next_instance_of(Gitlab::ProcessSupervisor) do |it|
allow(it).to receive(:alive).and_return(true)
allow(it).to receive(:supervise).and_yield([metrics_server_pid])
end
expect(supervisor).to receive(:alive).and_return(true)
expect(supervisor).to receive(:supervise).and_yield([metrics_server_pid])
expect(MetricsServer).to receive(:fork).twice.and_return(metrics_server_pid)
cli.run(%w(foo))
@@ -458,11 +450,8 @@ RSpec.describe Gitlab::SidekiqCluster::CLI, stub_settings_source: true do # rubo
context 'when the supervisor is shutting down' do
it 'does not restart the metrics server' do
allow_next_instance_of(Gitlab::ProcessSupervisor) do |it|
allow(it).to receive(:alive).and_return(false)
allow(it).to receive(:supervise).and_yield([metrics_server_pid])
end
expect(supervisor).to receive(:alive).and_return(false)
expect(supervisor).to receive(:supervise).and_yield([metrics_server_pid])
expect(MetricsServer).to receive(:fork).once.and_return(metrics_server_pid)
cli.run(%w(foo))
......
@@ -6,7 +6,9 @@ RSpec.describe Gitlab::ProcessSupervisor do
let(:health_check_interval_seconds) { 0.1 }
let(:check_terminate_interval_seconds) { 1 }
let(:forwarded_signals) { [] }
let(:process_id) do
let(:process_ids) { [spawn_process, spawn_process] }
def spawn_process
Process.spawn('while true; do sleep 1; done').tap do |pid|
Process.detach(pid)
end
@@ -22,54 +24,52 @@ RSpec.describe Gitlab::ProcessSupervisor do
end
after do
if Gitlab::ProcessManagement.process_alive?(process_id)
Process.kill('KILL', process_id)
process_ids.each do |pid|
Process.kill('KILL', pid)
rescue Errno::ESRCH
# Ignore if a process wasn't actually alive.
end
end
describe '#supervise' do
context 'while supervised process is alive' do
context 'while supervised processes are alive' do
it 'does not invoke callback' do
expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(true)
expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
pids_killed = []
thread = Thread.new do
supervisor.supervise(process_id) do |dead_pids|
supervisor.supervise(process_ids) do |dead_pids|
pids_killed = dead_pids
[]
end
end
# Wait several times the poll frequency of the supervisor.
sleep health_check_interval_seconds * 10
thread.terminate
expect(pids_killed).to be_empty
expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(true)
expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
end
end
context 'when supervised process dies' do
it 'triggers callback with the dead PIDs' do
expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(true)
context 'when a supervised process dies' do
it 'triggers callback with the dead PIDs and adds new PIDs to supervised PIDs' do
expect(Gitlab::ProcessManagement.all_alive?(process_ids)).to be(true)
pids_killed = []
thread = Thread.new do
supervisor.supervise(process_id) do |dead_pids|
supervisor.supervise(process_ids) do |dead_pids|
pids_killed = dead_pids
[]
end
[42] # Fake starting a new process in place of the terminated one.
end
# Terminate the supervised process.
Process.kill('TERM', process_id)
Process.kill('TERM', process_ids.first)
await_condition(sleep_sec: health_check_interval_seconds) do
pids_killed == [process_id]
pids_killed == [process_ids.first]
end
thread.terminate
expect(Gitlab::ProcessManagement.process_alive?(process_id)).to be(false)
expect(Gitlab::ProcessManagement.process_alive?(process_ids.first)).to be(false)
expect(Gitlab::ProcessManagement.process_alive?(process_ids.last)).to be(true)
expect(supervisor.supervised_pids).to match_array([process_ids.last, 42])
end
end
@@ -78,7 +78,7 @@ RSpec.describe Gitlab::ProcessSupervisor do
allow(supervisor).to receive(:sleep)
allow(Gitlab::ProcessManagement).to receive(:trap_signals)
allow(Gitlab::ProcessManagement).to receive(:all_alive?).and_return(false)
allow(Gitlab::ProcessManagement).to receive(:signal_processes).with([process_id], anything)
allow(Gitlab::ProcessManagement).to receive(:signal_processes).with(process_ids, anything)
end
context 'termination signals' do
@@ -87,21 +87,21 @@ RSpec.describe Gitlab::ProcessSupervisor do
allow(Gitlab::ProcessManagement).to receive(:any_alive?).and_return(false)
expect(Gitlab::ProcessManagement).to receive(:trap_signals).ordered.with(%i(INT TERM)).and_yield(:TERM)
expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], :TERM)
expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :TERM)
expect(supervisor).not_to receive(:sleep).with(check_terminate_interval_seconds)
supervisor.supervise(process_id) { [] }
supervisor.supervise(process_ids) { [] }
end
end
context 'when TERM does not result in timely shutdown of processes' do
it 'issues a KILL signal after the grace period expires' do
expect(Gitlab::ProcessManagement).to receive(:trap_signals).with(%i(INT TERM)).and_yield(:TERM)
expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], :TERM)
expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :TERM)
expect(supervisor).to receive(:sleep).ordered.with(check_terminate_interval_seconds).at_least(:once)
expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], '-KILL')
expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, '-KILL')
supervisor.supervise(process_id) { [] }
supervisor.supervise(process_ids) { [] }
end
end
end
@@ -111,10 +111,53 @@ RSpec.describe Gitlab::ProcessSupervisor do
it 'forwards given signals to the observed processes' do
expect(Gitlab::ProcessManagement).to receive(:trap_signals).with(%i(USR1)).and_yield(:USR1)
expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with([process_id], :USR1)
expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :USR1)
supervisor.supervise(process_ids) { [] }
end
end
end
end
describe '#shutdown' do
context 'when supervisor is supervising processes' do
before do
supervisor.supervise(process_ids)
end
context 'when supervisor is alive' do
it 'signals TERM then KILL to all supervised processes' do
expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, :TERM)
expect(Gitlab::ProcessManagement).to receive(:signal_processes).ordered.with(process_ids, '-KILL')
supervisor.shutdown
end
it 'stops the supervisor' do
expect { supervisor.shutdown }.to change { supervisor.alive }.from(true).to(false)
end
end
context 'when supervisor has already shut down' do
before do
supervisor.shutdown
end
it 'does nothing' do
expect(supervisor.alive).to be(false)
expect(Gitlab::ProcessManagement).not_to receive(:signal_processes)
supervisor.shutdown
end
end
end
context 'when supervisor never started' do
it 'does nothing' do
expect(supervisor.alive).to be(false)
expect(Gitlab::ProcessManagement).not_to receive(:signal_processes)
supervisor.shutdown
end
end
end
......
# frozen_string_literal: true
require 'fast_spec_helper'
require 'spec_helper'
require_relative '../../metrics_server/metrics_server'
require_relative '../support/helpers/next_instance_of'
RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
include NextInstanceOf
let(:prometheus_config) { ::Prometheus::Client.configuration }
let(:metrics_dir) { Dir.mktmpdir }
@@ -205,4 +202,47 @@ RSpec.describe MetricsServer do # rubocop:disable RSpec/FilePath
it_behaves_like 'a metrics exporter', 'sidekiq', 'sidekiq_exporter'
end
describe '.start_for_puma' do
let(:supervisor) { instance_double(Gitlab::ProcessSupervisor) }
before do
allow(Gitlab::ProcessSupervisor).to receive(:instance).and_return(supervisor)
end
it 'spawns a server process and supervises it' do
expect(Process).to receive(:spawn).with(
include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
).once.and_return(42)
expect(supervisor).to receive(:supervise).with(42)
described_class.start_for_puma
end
context 'when the supervisor callback is invoked' do
context 'and the supervisor is alive' do
it 'restarts the metrics server' do
expect(supervisor).to receive(:alive).and_return(true)
expect(supervisor).to receive(:supervise).and_yield
expect(Process).to receive(:spawn).with(
include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
).twice.and_return(42)
described_class.start_for_puma
end
end
context 'and the supervisor is not alive' do
it 'does not restart the server' do
expect(supervisor).to receive(:alive).and_return(false)
expect(supervisor).to receive(:supervise).and_yield
expect(Process).to receive(:spawn).with(
include('METRICS_SERVER_TARGET' => 'puma'), end_with('bin/metrics-server'), anything
).once.and_return(42)
described_class.start_for_puma
end
end
end
end
end