Merge branch 'stackprof-sidekiq' into 'master'

Defer stackprof signal trap when running in sidekiq. See merge request gitlab-org/gitlab!37988

Merge branch 'stackprof-sidekiq' into 'master'
Defer stackprof signal trap when running in sidekiq. See merge request gitlab-org/gitlab!37988
239f2bc2 · Sean McGivern · b6c7e32b · af4dac41 · 239f2bc2 · 239f2bc2
Commit 239f2bc2 authored Jul 30, 2020 by Sean McGivern
Hide whitespace changes
Inline Side-by-side

Showing with 110 additions and 78 deletions

config/initializers/stackprof.rb config/initializers/stackprof.rb +106 -78

doc/development/performance.md doc/development/performance.md +4 -0

No files found.
--- a/config/initializers/stackprof.rb
+++ b/config/initializers/stackprof.rb
@@ -8,94 +8,122 @@
 # * timeout profile after 30 seconds
 # * write to $TMPDIR/stackprof.$PID.$RAND.profile
-if Gitlab::Utils.to_boolean(ENV['STACKPROF_ENABLED'].to_s)
+module Gitlab
-  Gitlab::Cluster::LifecycleEvents.on_worker_start do
+  class StackProf
-    require 'stackprof'
+    # this is a workaround for sidekiq, which defines its own SIGUSR2 handler.
-    require 'tmpdir'
+    # by deferring to the sidekiq startup event, we get to set up our own
+    # handler late enough.
+    # see also: https://github.com/mperham/sidekiq/pull/4653
+    def self.install
+      require 'stackprof'
+      require 'tmpdir'
+      if Gitlab::Runtime.sidekiq?
+        Sidekiq.configure_server do |config|
+          config.on :startup do
+            on_worker_start
+          end
+        end
+      else
+        Gitlab::Cluster::LifecycleEvents.on_worker_start do
+          on_worker_start
+        end
+      end
+    end
-    Gitlab::AppJsonLogger.info "stackprof: listening on SIGUSR2 signal"
+    def self.on_worker_start
+      Gitlab::AppJsonLogger.info(
+        event: "stackprof",
+        message: "listening on SIGUSR2 signal",
+        pid: Process.pid
+      )
-    # create a pipe in order to propagate signal out of the signal handler
+      # create a pipe in order to propagate signal out of the signal handler
-    # see also: https://cr.yp.to/docs/selfpipe.html
+      # see also: https://cr.yp.to/docs/selfpipe.html
-    read, write = IO.pipe
+      read, write = IO.pipe
-    # create a separate thread that polls for signals on the pipe.
+      # create a separate thread that polls for signals on the pipe.
-    #
+      #
-    # this way we do not execute in signal handler context, which
+      # this way we do not execute in signal handler context, which
-    # lifts restrictions and also serializes the calls in a thread-safe
+      # lifts restrictions and also serializes the calls in a thread-safe
-    # manner.
+      # manner.
-    #
+      #
-    # it's very similar to a goroutine and channel design.
+      # it's very similar to a goroutine and channel design.
-    #
+      #
-    # another nice benefit of this method is that we can timeout the
+      # another nice benefit of this method is that we can timeout the
-    # IO.select call, allowing the profile to automatically stop after
+      # IO.select call, allowing the profile to automatically stop after
-    # a given interval (by default 30 seconds), avoiding unbounded memory
+      # a given interval (by default 30 seconds), avoiding unbounded memory
-    # growth from a profile that was started and never stopped.
+      # growth from a profile that was started and never stopped.
-    t = Thread.new do
+      t = Thread.new do
-      timeout_s = ENV['STACKPROF_TIMEOUT_S']&.to_i || 30
+        timeout_s = ENV['STACKPROF_TIMEOUT_S']&.to_i || 30
-      current_timeout_s = nil
+        current_timeout_s = nil
-      loop do
+        loop do
-        got_value = IO.select([read], nil, nil, current_timeout_s)
+          got_value = IO.select([read], nil, nil, current_timeout_s)
-        read.getbyte if got_value
+          read.getbyte if got_value
-        if StackProf.running?
+          if ::StackProf.running?
-          stackprof_file_prefix = ENV['STACKPROF_FILE_PREFIX'] || Dir.tmpdir
+            stackprof_file_prefix = ENV['STACKPROF_FILE_PREFIX'] || Dir.tmpdir
-          stackprof_out_file = "#{stackprof_file_prefix}/stackprof.#{Process.pid}.#{SecureRandom.hex(6)}.profile"
+            stackprof_out_file = "#{stackprof_file_prefix}/stackprof.#{Process.pid}.#{SecureRandom.hex(6)}.profile"
-          Gitlab::AppJsonLogger.info(
+            Gitlab::AppJsonLogger.info(
-            event: "stackprof",
+              event: "stackprof",
-            message: "stopping profile",
+              message: "stopping profile",
-            output_filename: stackprof_out_file,
+              output_filename: stackprof_out_file,
-            pid: Process.pid,
+              pid: Process.pid,
-            timeout_s: timeout_s,
+              timeout_s: timeout_s,
-            timed_out: got_value.nil?
+              timed_out: got_value.nil?
-          )
+            )
-          StackProf.stop
+            ::StackProf.stop
-          StackProf.results(stackprof_out_file)
+            ::StackProf.results(stackprof_out_file)
-          current_timeout_s = nil
+            current_timeout_s = nil
-        else
+          else
-          Gitlab::AppJsonLogger.info(
+            Gitlab::AppJsonLogger.info(
-            event: "stackprof",
+              event: "stackprof",
-            message: "starting profile",
+              message: "starting profile",
-            pid: Process.pid
+              pid: Process.pid
-          )
+            )
-          StackProf.start(
+            ::StackProf.start(
-            mode: :cpu,
+              mode: :cpu,
-            raw: Gitlab::Utils.to_boolean(ENV['STACKPROF_RAW'] || 'true'),
+              raw: Gitlab::Utils.to_boolean(ENV['STACKPROF_RAW'] || 'true'),
-            interval: ENV['STACKPROF_INTERVAL_US']&.to_i || 10_000
+              interval: ENV['STACKPROF_INTERVAL_US']&.to_i || 10_000
-          )
+            )
-          current_timeout_s = timeout_s
+            current_timeout_s = timeout_s
+          end
        end
      end
-    end
+      t.abort_on_exception = true
-    t.abort_on_exception = true
-    # in the case of puma, this will override the existing SIGUSR2 signal handler
+      # in the case of puma, this will override the existing SIGUSR2 signal handler
-    # that can be used to trigger a restart.
+      # that can be used to trigger a restart.
-    #
+      #
-    # puma cluster has two types of restarts:
+      # puma cluster has two types of restarts:
-    # * SIGUSR1: phased restart
+      # * SIGUSR1: phased restart
-    # * SIGUSR2: restart
+      # * SIGUSR2: restart
-    #
+      #
-    # phased restart is not supported in our configuration, because we use
+      # phased restart is not supported in our configuration, because we use
-    # preload_app. this means we will always perform a normal restart.
+      # preload_app. this means we will always perform a normal restart.
-    # additionally, phased restart is not supported when sending a SIGUSR2
+      # additionally, phased restart is not supported when sending a SIGUSR2
-    # directly to a puma worker (as opposed to the master process).
+      # directly to a puma worker (as opposed to the master process).
-    #
+      #
-    # the result is that the behaviour of SIGUSR1 and SIGUSR2 is identical in
+      # the result is that the behaviour of SIGUSR1 and SIGUSR2 is identical in
-    # our configuration, and we can always use a SIGUSR1 to perform a restart.
+      # our configuration, and we can always use a SIGUSR1 to perform a restart.
-    #
+      #
-    # thus, it is acceptable for us to re-appropriate the SIGUSR2 signal, and
+      # thus, it is acceptable for us to re-appropriate the SIGUSR2 signal, and
-    # override the puma behaviour.
+      # override the puma behaviour.
-    #
+      #
-    # see also:
+      # see also:
-    # * https://github.com/puma/puma/blob/master/docs/signals.md#puma-signals
+      # * https://github.com/puma/puma/blob/master/docs/signals.md#puma-signals
-    # * https://github.com/phusion/unicorn/blob/master/SIGNALS
+      # * https://github.com/phusion/unicorn/blob/master/SIGNALS
-    # * https://github.com/mperham/sidekiq/wiki/Signals
+      # * https://github.com/mperham/sidekiq/wiki/Signals
-    Signal.trap('SIGUSR2') do
+      Signal.trap('SIGUSR2') do
-      write.write('.')
+        write.write('.')
+      end
    end
  end
 end
+if Gitlab::Utils.to_boolean(ENV['STACKPROF_ENABLED'].to_s)
+  Gitlab::StackProf.install
+end
--- a/doc/development/performance.md
+++ b/doc/development/performance.md
@@ -281,6 +281,10 @@ This can be done via `pkill -USR2 puma:`. The `:` disambiguates between `puma
 4.3.3.gitlab.2 ...` (the master process) from `puma: cluster worker 0: ...` (the
 worker processes), selecting the latter.
+For Sidekiq, the signal can be sent to the `sidekiq-cluster` process via `pkill
+-USR2 bin/sidekiq-cluster`, which will forward the signal to all Sidekiq
+children. Alternatively, you can send the signal to a specific PID of interest.
 Production profiles can be especially noisy. It can be helpful to visualize them
 as a [flamegraph](https://github.com/brendangregg/FlameGraph). This can be done
 via: