Commit 643d4fad authored by Stan Hu

Merge branch '59754-independent-sidekiq-memory-killer-ee' into 'master'

Add Sidekiq daemon memory killer

See merge request gitlab-org/gitlab!16900
parents 4d8ed0b1 0d179ebd
......@@ -28,16 +28,18 @@ if Rails.env.development?
end
enable_json_logs = Gitlab.config.sidekiq.log_format == 'json'
enable_sidekiq_monitor = ENV.fetch("SIDEKIQ_MONITOR_WORKER", 0).to_i.nonzero?
enable_sidekiq_memory_killer = ENV['SIDEKIQ_MEMORY_KILLER_MAX_RSS'].to_i.nonzero?
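# SIDEKIQ_DAEMON_MEMORY_KILLER chooses between the daemon MemoryKiller and the legacy middleware MemoryKiller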
use_sidekiq_daemon_memory_killer = ENV["SIDEKIQ_DAEMON_MEMORY_KILLER"].to_i.nonzero?
use_sidekiq_legacy_memory_killer = !use_sidekiq_daemon_memory_killer
Sidekiq.configure_server do |config|
config.redis = queues_config_hash
config.server_middleware do |chain|
chain.add Gitlab::SidekiqMiddleware::Monitor if enable_sidekiq_monitor
chain.add Gitlab::SidekiqMiddleware::Monitor
chain.add Gitlab::SidekiqMiddleware::Metrics if Settings.monitoring.sidekiq_exporter
chain.add Gitlab::SidekiqMiddleware::ArgumentsLogger if ENV['SIDEKIQ_LOG_ARGUMENTS'] && !enable_json_logs
chain.add Gitlab::SidekiqMiddleware::MemoryKiller if ENV['SIDEKIQ_MEMORY_KILLER_MAX_RSS']
chain.add Gitlab::SidekiqMiddleware::MemoryKiller if enable_sidekiq_memory_killer && use_sidekiq_legacy_memory_killer
chain.add Gitlab::SidekiqMiddleware::RequestStoreMiddleware unless ENV['SIDEKIQ_REQUEST_STORE'] == '0'
chain.add Gitlab::SidekiqMiddleware::BatchLoader
chain.add Gitlab::SidekiqMiddleware::CorrelationLogger
......@@ -60,7 +62,11 @@ Sidekiq.configure_server do |config|
# Sidekiq (e.g. in an initializer).
ActiveRecord::Base.clear_all_connections!
Gitlab::SidekiqDaemon::Monitor.instance.start if enable_sidekiq_monitor
# Start the monitor to track running jobs. By default, job cancellation is not enabled.
# To cancel jobs, set `SIDEKIQ_MONITOR_WORKER=1` to enable the notification channel.
Gitlab::SidekiqDaemon::Monitor.instance.start
Gitlab::SidekiqDaemon::MemoryKiller.instance.start if enable_sidekiq_memory_killer && use_sidekiq_daemon_memory_killer
end
if enable_reliable_fetch?
......
......@@ -26,18 +26,50 @@ run as a process group leader (e.g., using `chpst -P`). If using Omnibus or the
The MemoryKiller is controlled using environment variables.
- `SIDEKIQ_MEMORY_KILLER_MAX_RSS`: if this variable is set, and its value is
greater than 0, then after each Sidekiq job, the MemoryKiller will check the
RSS of the Sidekiq process that executed the job. If the RSS of the Sidekiq
process (expressed in kilobytes) exceeds SIDEKIQ_MEMORY_KILLER_MAX_RSS, a
delayed shutdown is triggered. The default value for Omnibus packages is set
- `SIDEKIQ_DAEMON_MEMORY_KILLER`: defaults to 0. When set to 1, the MemoryKiller
works in _daemon_ mode. Otherwise, the MemoryKiller works in _legacy_ mode.
In _legacy_ mode, the MemoryKiller checks the Sidekiq process RSS after each job.
In _daemon_ mode, the MemoryKiller checks the Sidekiq process RSS every 3 seconds
(defined by `SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL`). An example configuration
combining these variables appears after this list.
- `SIDEKIQ_MEMORY_KILLER_MAX_RSS`: if this variable is set, and its value is greater
than 0, the MemoryKiller is enabled. Otherwise, the MemoryKiller is disabled.
`SIDEKIQ_MEMORY_KILLER_MAX_RSS` defines the allowed RSS (in kilobytes) of the Sidekiq process.
In _legacy_ mode, if the Sidekiq process exceeds the allowed RSS, an irreversible
delayed graceful restart will be triggered. The restart of Sidekiq will happen
after `SIDEKIQ_MEMORY_KILLER_GRACE_TIME` seconds.
In _daemon_ mode, if the Sidekiq process exceeds the allowed RSS for longer than
`SIDEKIQ_MEMORY_KILLER_GRACE_TIME`, the graceful restart will be triggered. If the
Sidekiq process goes back below the allowed RSS within `SIDEKIQ_MEMORY_KILLER_GRACE_TIME`,
the restart will be aborted.
The default value for Omnibus packages is set
[in the omnibus-gitlab
repository](https://gitlab.com/gitlab-org/omnibus-gitlab/blob/master/files/gitlab-cookbooks/gitlab/attributes/default.rb).
- `SIDEKIQ_MEMORY_KILLER_GRACE_TIME`: defaults to 900 seconds (15 minutes). When
a shutdown is triggered, the Sidekiq process will keep working normally for
another 15 minutes.
- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT`: defaults to 30 seconds. When the grace
time has expired, the MemoryKiller tells Sidekiq to stop accepting new jobs.
Existing jobs get 30 seconds to finish. After that, the MemoryKiller tells
Sidekiq to shut down, and an external supervision mechanism (e.g. Runit) must
restart Sidekiq.
- `SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS`: used in _daemon_ mode. If the Sidekiq
process RSS (expressed in kilobytes) exceeds `SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS`,
an immediate graceful restart of Sidekiq is triggered.
- `SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL`: used in _daemon_ mode to define how
often to check the process RSS; defaults to 3 seconds.
- `SIDEKIQ_MEMORY_KILLER_GRACE_TIME`: defaults to 900 seconds (15 minutes).
The usage of this variable is described as part of `SIDEKIQ_MEMORY_KILLER_MAX_RSS`.
- `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT`: defaults to 30 seconds. This defines the
maximum time allowed for all Sidekiq jobs to finish. No new jobs will be accepted
during that time, and the process will exit as soon as all jobs finish.
If jobs do not finish during that time, the MemoryKiller will interrupt all currently
running jobs by sending `SIGTERM` to the Sidekiq process.
If the hard shutdown/restart is not performed by Sidekiq itself,
the Sidekiq process will be forcefully terminated after
`Sidekiq.options[:timeout] + 2` seconds. An external supervision mechanism
(e.g. Runit) must restart Sidekiq afterwards.
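For illustration only, a possible _daemon_ mode configuration could combine these
variables as follows (the values are examples, not recommendations for any
particular installation):

```bash
SIDEKIQ_DAEMON_MEMORY_KILLER=1               # daemon mode: check RSS periodically instead of after each job
SIDEKIQ_MEMORY_KILLER_MAX_RSS=2000000        # soft limit in kilobytes (~2 GB)
SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS=3000000 # hard limit in kilobytes: restart immediately above ~3 GB
SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL=3       # seconds between RSS checks
SIDEKIQ_MEMORY_KILLER_GRACE_TIME=900         # seconds the RSS may stay between the soft and hard limit
SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT=30       # seconds given to running jobs before they are interrupted
```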
......@@ -31,7 +31,9 @@ Read through the current performance problems using the Import/Export below.
Out of memory (OOM) errors are normally caused by the [Sidekiq Memory Killer](../administration/operations/sidekiq_memory_killer.md):
```bash
SIDEKIQ_MEMORY_KILLER_MAX_RSS = 2GB in GitLab.com
SIDEKIQ_MEMORY_KILLER_MAX_RSS = 2000000
SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS = 3000000
SIDEKIQ_MEMORY_KILLER_GRACE_TIME = 900
```
An import status of `started` and the following Sidekiq logs signal a memory issue:
......
......@@ -179,8 +179,12 @@ and the following environment variables:
| Setting | GitLab.com | Default |
|-------- |----------- |-------- |
| `SIDEKIQ_MEMORY_KILLER_MAX_RSS` | `1000000` | `2000000` |
| `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_SIGNAL` | `SIGKILL` | - |
| `SIDEKIQ_DAEMON_MEMORY_KILLER` | - | - |
| `SIDEKIQ_MEMORY_KILLER_MAX_RSS` | `16000000` | `2000000` |
| `SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS` | - | - |
| `SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL` | - | `3` |
| `SIDEKIQ_MEMORY_KILLER_GRACE_TIME` | - | `900` |
| `SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT` | - | `30` |
| `SIDEKIQ_LOG_ARGUMENTS` | `1` | - |
## Cron jobs
......
# frozen_string_literal: true
module Gitlab
module SidekiqDaemon
class MemoryKiller < Daemon
include ::Gitlab::Utils::StrongMemoize
# Today's 64-bit CPUs support at most 256 TB of memory, which is big enough.
MAX_MEMORY_KB = 256 * 1024 * 1024 * 1024
# RSS below `soft_limit_rss` is considered safe
SOFT_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_MAX_RSS', 2000000).to_i
# A Sidekiq process with RSS above `hard_limit_rss` will be restarted immediately
HARD_LIMIT_RSS_KB = ENV.fetch('SIDEKIQ_MEMORY_KILLER_HARD_LIMIT_RSS', MAX_MEMORY_KB).to_i
# RSS in range (soft_limit_rss, hard_limit_rss) is allowed for GRACE_BALLOON_SECONDS
GRACE_BALLOON_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_GRACE_TIME', 15 * 60).to_i
# Check RSS every CHECK_INTERVAL_SECONDS, minimum 2 seconds
CHECK_INTERVAL_SECONDS = [ENV.fetch('SIDEKIQ_MEMORY_KILLER_CHECK_INTERVAL', 3).to_i, 2].max
# Give Sidekiq up to 30 seconds to allow existing jobs to finish after exceeding the limit
SHUTDOWN_TIMEOUT_SECONDS = ENV.fetch('SIDEKIQ_MEMORY_KILLER_SHUTDOWN_WAIT', 30).to_i
def initialize
super
@enabled = true
end
private
def start_working
Sidekiq.logger.info(
class: self.class.to_s,
action: 'start',
pid: pid,
message: 'Starting Gitlab::SidekiqDaemon::MemoryKiller Daemon'
)
while enabled?
begin
restart_sidekiq unless rss_within_range?
sleep(CHECK_INTERVAL_SECONDS)
rescue => e
log_exception(e, __method__)
rescue Exception => e # rubocop:disable Lint/RescueException
log_exception(e, __method__)
raise e
end
end
ensure
Sidekiq.logger.warn(
class: self.class.to_s,
action: 'stop',
pid: pid,
message: 'Stopping Gitlab::SidekiqDaemon::MemoryKiller Daemon'
)
end
def log_exception(exception, method)
Sidekiq.logger.warn(
class: self.class.to_s,
pid: pid,
message: "Exception from #{method}: #{exception.message}"
)
end
def stop_working
@enabled = false
end
def enabled?
@enabled
end
def restart_sidekiq
# Tell Sidekiq to stop fetching new jobs.
# We first send the signal and then wait the given time.
# We also monitor the number of running jobs and allow an early restart.
signal_and_wait(SHUTDOWN_TIMEOUT_SECONDS, 'SIGTSTP', 'stop fetching new jobs')
return unless enabled?
# Tell Sidekiq to restart itself.
# To be extra safe, wait `Sidekiq.options[:timeout] + 2` seconds before SIGKILL.
signal_and_wait(Sidekiq.options[:timeout] + 2, 'SIGTERM', 'gracefully shut down')
return unless enabled?
# Ideally we should never reach this condition
# Wait for Sidekiq to shut down gracefully, and kill it if it didn't
# Kill the whole pgroup, so we can be sure no children are left behind
signal_pgroup('SIGKILL', 'die')
end
def rss_within_range?
current_rss = nil
deadline = Time.now + GRACE_BALLOON_SECONDS.seconds
loop do
return true unless enabled?
current_rss = get_rss
# RSS above the hard limit should trigger a forcible shutdown right away
break if current_rss > hard_limit_rss
# RSS went below the soft limit
return true if current_rss < soft_limit_rss
# RSS did not go below the soft limit within the deadline; restart
break if Time.now > deadline
sleep(CHECK_INTERVAL_SECONDS)
end
log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss)
false
end
def log_rss_out_of_range(current_rss, hard_limit_rss, soft_limit_rss)
Sidekiq.logger.warn(
class: self.class.to_s,
pid: pid,
message: 'Sidekiq worker RSS out of range',
current_rss: current_rss,
hard_limit_rss: hard_limit_rss,
soft_limit_rss: soft_limit_rss,
reason: out_of_range_description(current_rss, hard_limit_rss, soft_limit_rss)
)
end
def out_of_range_description(rss, hard_limit, soft_limit)
if rss > hard_limit
"current_rss(#{rss}) > hard_limit_rss(#{hard_limit})"
else
"current_rss(#{rss}) > soft_limit_rss(#{soft_limit}) longer than GRACE_BALLOON_SECONDS(#{GRACE_BALLOON_SECONDS})"
end
end
def get_rss
output, status = Gitlab::Popen.popen(%W(ps -o rss= -p #{pid}), Rails.root.to_s)
return 0 unless status&.zero?
output.to_i
end
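# The effective soft limit is raised by the expected memory growth of the currently running jobs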
def soft_limit_rss
SOFT_LIMIT_RSS_KB + rss_increase_by_jobs
end
def hard_limit_rss
HARD_LIMIT_RSS_KB
end
def signal_and_wait(time, signal, explanation)
Sidekiq.logger.warn(
class: self.class.to_s,
pid: pid,
signal: signal,
explanation: explanation,
wait_time: time,
message: "Sending signal and waiting"
)
Process.kill(signal, pid)
deadline = Time.now + time
# We try to finish as early as possible once all jobs have finished,
# so we re-check that in the loop
sleep(CHECK_INTERVAL_SECONDS) while enabled? && any_jobs? && Time.now < deadline
end
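# When we are the process group leader, signal the whole group (pid 0) so that
# child processes spawned by jobs are terminated as well; otherwise signal only our own pid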
def signal_pgroup(signal, explanation)
if Process.getpgrp == pid
pid_or_pgrp_str = 'PGRP'
pid_to_signal = 0
else
pid_or_pgrp_str = 'PID'
pid_to_signal = pid
end
Sidekiq.logger.warn(
class: self.class.to_s,
signal: signal,
pid: pid,
message: "sending Sidekiq worker #{pid_or_pgrp_str}-#{pid} #{signal} (#{explanation})"
)
Process.kill(signal, pid_to_signal)
end
def rss_increase_by_jobs
Gitlab::SidekiqDaemon::Monitor.instance.jobs.sum do |job| # rubocop:disable CodeReuse/ActiveRecord
rss_increase_by_job(job)
end
end
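# Per-job allowance: the expected growth per second multiplied by the job's runtime,
# capped at a per-job maximum, both declared via `sidekiq_options` on the worker class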
def rss_increase_by_job(job)
memory_growth_kb = get_job_options(job, 'memory_killer_memory_growth_kb', 0).to_i
max_memory_growth_kb = get_job_options(job, 'memory_killer_max_memory_growth_kb', MAX_MEMORY_KB).to_i
return 0 if memory_growth_kb.zero?
time_elapsed = Time.now.to_i - job[:started_at]
[memory_growth_kb * time_elapsed, max_memory_growth_kb].min
end
def get_job_options(job, key, default)
job[:worker_class].sidekiq_options.fetch(key, default)
rescue
default
end
def pid
Process.pid
end
def any_jobs?
Gitlab::SidekiqDaemon::Monitor.instance.jobs.any?
end
end
end
end
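The per-job allowance used by `rss_increase_by_job` comes from `sidekiq_options` declared
on each worker class. A minimal sketch of such a declaration follows; the worker name and
numbers are hypothetical, only the option keys match `get_job_options` above, and
`ApplicationWorker` is assumed to be GitLab's usual worker mixin:

```ruby
# Hypothetical worker illustrating the options read by rss_increase_by_job.
class ExampleMemoryHungryWorker
  include ApplicationWorker

  # Expected RSS growth in kilobytes per second of runtime, and a cap on the
  # total allowance granted to a single running job (also in kilobytes).
  sidekiq_options memory_killer_memory_growth_kb: 10,
                  memory_killer_max_memory_growth_kb: 300_000

  def perform
    # long-running, memory-hungry work
  end
end
```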
......@@ -14,19 +14,19 @@ module Gitlab
# that should not be caught by application
CancelledError = Class.new(Exception) # rubocop:disable Lint/InheritException
attr_reader :jobs_thread
attr_reader :jobs
attr_reader :jobs_mutex
def initialize
super
@jobs_thread = {}
@jobs = {}
@jobs_mutex = Mutex.new
end
def within_job(jid, queue)
def within_job(worker_class, jid, queue)
jobs_mutex.synchronize do
jobs_thread[jid] = Thread.current
jobs[jid] = { worker_class: worker_class, thread: Thread.current, started_at: Time.now.to_i }
end
if cancelled?(jid)
......@@ -43,7 +43,7 @@ module Gitlab
yield
ensure
jobs_mutex.synchronize do
jobs_thread.delete(jid)
jobs.delete(jid)
end
end
......@@ -62,6 +62,9 @@ module Gitlab
private
def start_working
return unless notification_channel_enabled?
begin
Sidekiq.logger.info(
class: self.class.to_s,
action: 'start',
......@@ -80,6 +83,7 @@ module Gitlab
message: 'Stopping Monitor Daemon'
)
end
end
def stop_working
thread.raise(Interrupt) if thread.alive?
......@@ -156,7 +160,7 @@ module Gitlab
# This is why it passes thread in block,
# to ensure that we do process this thread
def find_thread_unsafe(jid)
jobs_thread[jid]
jobs.dig(jid, :thread)
end
def find_thread_with_lock(jid)
......@@ -179,6 +183,10 @@ module Gitlab
def self.cancel_job_key(jid)
"sidekiq:cancel:#{jid}"
end
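# The job cancellation notification channel is opt-in via SIDEKIQ_MONITOR_WORKER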
def notification_channel_enabled?
ENV.fetch("SIDEKIQ_MONITOR_WORKER", 0).to_i.nonzero?
end
end
end
end
......@@ -4,7 +4,7 @@ module Gitlab
module SidekiqMiddleware
class Monitor
def call(worker, job, queue)
Gitlab::SidekiqDaemon::Monitor.instance.within_job(job['jid'], queue) do
Gitlab::SidekiqDaemon::Monitor.instance.within_job(worker.class, job['jid'], queue) do
yield
end
rescue Gitlab::SidekiqDaemon::Monitor::CancelledError
......
......@@ -8,12 +8,12 @@ describe Gitlab::SidekiqDaemon::Monitor do
describe '#within_job' do
it 'tracks thread' do
blk = proc do
expect(monitor.jobs_thread['jid']).not_to be_nil
expect(monitor.jobs.dig('jid', :thread)).not_to be_nil
"OK"
end
expect(monitor.within_job('jid', 'queue', &blk)).to eq("OK")
expect(monitor.within_job('worker_class', 'jid', 'queue', &blk)).to eq("OK")
end
context 'when job is canceled' do
......@@ -25,19 +25,34 @@ describe Gitlab::SidekiqDaemon::Monitor do
it 'does not execute a block' do
expect do |blk|
monitor.within_job(jid, 'queue', &blk)
monitor.within_job('worker_class', jid, 'queue', &blk)
rescue described_class::CancelledError
end.not_to yield_control
end
it 'raises exception' do
expect { monitor.within_job(jid, 'queue') }.to raise_error(
expect { monitor.within_job('worker_class', jid, 'queue') }.to raise_error(
described_class::CancelledError)
end
end
end
describe '#start_working' do
describe '#start_working when notification channel not enabled' do
subject { monitor.send(:start_working) }
it 'returns directly' do
allow(monitor).to receive(:notification_channel_enabled?).and_return(nil)
expect(Sidekiq.logger).not_to receive(:info)
expect(Sidekiq.logger).not_to receive(:warn)
expect(monitor).not_to receive(:enabled?)
expect(monitor).not_to receive(:process_messages)
subject
end
end
describe '#start_working when notification channel enabled' do
subject { monitor.send(:start_working) }
before do
......@@ -45,6 +60,7 @@ describe Gitlab::SidekiqDaemon::Monitor do
# we toggle `enabled?` flag after the first call
stub_const('Gitlab::SidekiqDaemon::Monitor::RECONNECT_TIME', 0)
allow(monitor).to receive(:enabled?).and_return(true, false)
allow(monitor).to receive(:notification_channel_enabled?).and_return(1)
allow(Sidekiq.logger).to receive(:info)
allow(Sidekiq.logger).to receive(:warn)
......@@ -204,7 +220,7 @@ describe Gitlab::SidekiqDaemon::Monitor do
let(:thread) { Thread.new { sleep 1000 } }
before do
monitor.jobs_thread[jid] = thread
monitor.jobs[jid] = { worker_class: 'worker_class', thread: thread, started_at: Time.now.to_i }
end
after do
......@@ -258,4 +274,24 @@ describe Gitlab::SidekiqDaemon::Monitor do
subject
end
end
describe '#notification_channel_enabled?' do
subject { monitor.send(:notification_channel_enabled?) }
it 'returns nil when SIDEKIQ_MONITOR_WORKER is not set' do
expect(subject).to be nil
end
it 'returns nil when SIDEKIQ_MONITOR_WORKER is set to 0' do
allow(ENV).to receive(:fetch).with('SIDEKIQ_MONITOR_WORKER', 0).and_return("0")
expect(subject).to be nil
end
it 'returns 1 when SIDEKIQ_MONITOR_WORKER is set to 1' do
allow(ENV).to receive(:fetch).with('SIDEKIQ_MONITOR_WORKER', 0).and_return("1")
expect(subject).to be 1
end
end
end
......@@ -12,7 +12,7 @@ describe Gitlab::SidekiqMiddleware::Monitor do
it 'calls Gitlab::SidekiqDaemon::Monitor' do
expect(Gitlab::SidekiqDaemon::Monitor.instance).to receive(:within_job)
.with('job-id', 'my-queue')
.with(anything, 'job-id', 'my-queue')
.and_call_original
expect { |blk| monitor.call(worker, job, queue, &blk) }.to yield_control
......