Use a custom Ruby patch to instrument memory usage

This uses a custom RubyVM patch to instrument per-request/per-worker memory allocations considering that GitLab runs in a multi-threaded environment. This also slightly refactors a way how we start instrumentation.

Use a custom Ruby patch to instrument memory usage
This uses a custom RubyVM patch to instrument per-request/per-worker memory allocations considering that GitLab runs in a multi-threaded environment. This also slightly refactors a way how we start instrumentation.
e96a85ad · Kamil Trzciński · ebde0372 · e96a85ad · e96a85ad · e96a85ad
Commit e96a85ad authored Jan 22, 2021 by Kamil Trzciński
16 changed files
--- a/changelogs/unreleased/instrument-and-log-memory-usage.yml
+++ b/changelogs/unreleased/instrument-and-log-memory-usage.yml
+---
+title: Use a custom Ruby patch to instrument memory usage
+merge_request: 52306
+author:
+type: added
--- a/config/feature_flags/development/trace_memory_allocations.yml
+++ b/config/feature_flags/development/trace_memory_allocations.yml
+---
+name: trace_memory_allocations
+introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/52306
+rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/299524
+milestone: '13.9'
+type: development
+group: group::memory
+default_enabled: false
--- a/lib/gitlab/instrumentation_helper.rb
+++ b/lib/gitlab/instrumentation_helper.rb
@@ -7,16 +7,32 @@ module Gitlab
    DURATION_PRECISION = 6 # microseconds

    def keys
-      @keys ||= [:gitaly_calls,
-                 :gitaly_duration_s,
-                 :rugged_calls,
-                 :rugged_duration_s,
-                 :elasticsearch_calls,
-                 :elasticsearch_duration_s,
-                 *::Gitlab::Instrumentation::Redis.known_payload_keys,
-                 *::Gitlab::Metrics::Subscribers::ActiveRecord::DB_COUNTERS,
-                 *::Gitlab::Metrics::Subscribers::ExternalHttp::KNOWN_PAYLOAD_KEYS,
-                 *::Gitlab::Metrics::Subscribers::RackAttack::PAYLOAD_KEYS]
+      @keys ||= [
+        :cpu_s,
+        :gitaly_calls,
+        :gitaly_duration_s,
+        :rugged_calls,
+        :rugged_duration_s,
+        :elasticsearch_calls,
+        :elasticsearch_duration_s,
+        *::Gitlab::Memory::Instrumentation::KEY_MAPPING.values,
+        *::Gitlab::Instrumentation::Redis.known_payload_keys,
+        *::Gitlab::Metrics::Subscribers::ActiveRecord::DB_COUNTERS,
+        *::Gitlab::Metrics::Subscribers::ExternalHttp::KNOWN_PAYLOAD_KEYS,
+        *::Gitlab::Metrics::Subscribers::RackAttack::PAYLOAD_KEYS
+      ]
+    end
+
+    def init_instrumentation_data(request_ip: nil)
+      # Set `request_start_time` only if this is request
+      # This is done, as `request_start_time` imply `request_deadline`
+      if request_ip
+        Gitlab::RequestContext.instance.client_ip = request_ip
+        Gitlab::RequestContext.instance.request_start_time = Gitlab::Metrics::System.real_time
+      end
+
+      Gitlab::RequestContext.instance.start_thread_cpu_time = Gitlab::Metrics::System.thread_cpu_time
+      Gitlab::RequestContext.instance.thread_memory_allocations = Gitlab::Memory::Instrumentation.start_thread_memory_allocations
    end

    def add_instrumentation_data(payload)
@@ -28,6 +44,8 @@ module Gitlab
      instrument_active_record(payload)
      instrument_external_http(payload)
      instrument_rack_attack(payload)
+      instrument_cpu(payload)
+      instrument_thread_memory_allocations(payload)
    end

    def instrument_gitaly(payload)
@@ -89,6 +107,19 @@ module Gitlab
      payload.merge!(::Gitlab::Metrics::Subscribers::RackAttack.payload)
    end

+    def instrument_cpu(payload)
+      cpu_s = ::Gitlab::Metrics::System.thread_cpu_duration(
+        ::Gitlab::RequestContext.instance.start_thread_cpu_time)
+
+      payload[:cpu_s] = cpu_s.round(DURATION_PRECISION) if cpu_s
+    end
+
+    def instrument_thread_memory_allocations(payload)
+      counters = ::Gitlab::Memory::Instrumentation.measure_thread_memory_allocations(
+        ::Gitlab::RequestContext.instance.thread_memory_allocations)
+      payload.merge!(counters) if counters
+    end
+
    # Returns the queuing duration for a Sidekiq job in seconds, as a float, if the
    # `enqueued_at` field or `created_at` field is available.
    #
@@ -115,7 +146,7 @@ module Gitlab
    #
    # @param [Time] start
    def self.elapsed_by_absolute_time(start)
-      (Time.now - start).to_f.round(6)
+      (Time.now - start).to_f.round(DURATION_PRECISION)
    end
    private_class_method :elapsed_by_absolute_time


--- a/lib/gitlab/lograge/custom_options.rb
+++ b/lib/gitlab/lograge/custom_options.rb
@@ -29,10 +29,6 @@ module Gitlab
        payload[:etag_route] = event.payload[:etag_route] if event.payload[:etag_route]
        payload[Labkit::Correlation::CorrelationId::LOG_KEY] = event.payload[Labkit::Correlation::CorrelationId::LOG_KEY] || Labkit::Correlation::CorrelationId.current_id

-        if cpu_s = Gitlab::Metrics::System.thread_cpu_duration(::Gitlab::RequestContext.instance.start_thread_cpu_time)
-          payload[:cpu_s] = cpu_s.round(2)
-        end
-
        CLOUDFLARE_CUSTOM_HEADERS.each do |_, value|
          payload[value] = event.payload[value] if event.payload[value]
        end

--- a/lib/gitlab/memory/instrumentation.rb
+++ b/lib/gitlab/memory/instrumentation.rb
+# frozen_string_literal: true
+
+# This class uses a custom Ruby patch to allow
+# a per-thread memory allocation tracking in a efficient manner
+#
+# This concept is currently tried to be upstreamed here:
+# - https://github.com/ruby/ruby/pull/3978
+module Gitlab
+  module Memory
+    class Instrumentation
+      KEY_MAPPING = {
+        total_allocated_objects: :mem_objects,
+        total_malloc_bytes: :mem_bytes,
+        total_mallocs: :mem_mallocs
+      }.freeze
+
+      MUTEX = Mutex.new
+
+      def self.available?
+        Thread.respond_to?(:trace_memory_allocations=) &&
+          Thread.current.respond_to?(:memory_allocations)
+      end
+
+      # This method changes a global state
+      def self.ensure_feature_flag!
+        return unless available?
+
+        enabled = Feature.enabled?(:trace_memory_allocations)
+        return if enabled == Thread.trace_memory_allocations
+
+        MUTEX.synchronize do
+          # This enables or disables feature dynamically
+          # based on a feature flag
+          Thread.trace_memory_allocations = enabled
+        end
+      end
+
+      def self.start_thread_memory_allocations
+        return unless available?
+
+        ensure_feature_flag!
+
+        # it will return `nil` if disabled
+        Thread.current.memory_allocations
+      end
+
+      # This method returns a hash with the following keys:
+      # - mem_objects: a number of allocated heap slots (as reflected by GC)
+      # - mem_mallocs: a number of malloc calls
+      # - mem_bytes: a number of bytes allocated with a mallocs tied to heap slots
+      def self.measure_thread_memory_allocations(previous)
+        return unless available?
+        return unless previous
+
+        current = Thread.current.memory_allocations
+        return unless current
+
+        # calculate difference in a memory allocations
+        previous.to_h do |key, value|
+          [KEY_MAPPING.fetch(key), current[key].to_i - value]
+        end
+      end
+
+      def self.with_memory_allocations
+        previous = self.start_thread_memory_allocations
+        yield
+        self.measure_thread_memory_allocations(previous)
+      end
+    end
+  end
+end
--- a/lib/gitlab/middleware/request_context.rb
+++ b/lib/gitlab/middleware/request_context.rb
@@ -16,9 +16,7 @@ module Gitlab
        # load balancer's IP.
        req = Rack::Request.new(env)

-        Gitlab::RequestContext.instance.client_ip = req.ip
-        Gitlab::RequestContext.instance.start_thread_cpu_time = Gitlab::Metrics::System.thread_cpu_time
-        Gitlab::RequestContext.instance.request_start_time = Gitlab::Metrics::System.real_time
+        ::Gitlab::InstrumentationHelper.init_instrumentation_data(request_ip: req.ip)

        @app.call(env)
      end

--- a/lib/gitlab/request_context.rb
+++ b/lib/gitlab/request_context.rb
@@ -7,7 +7,7 @@ module Gitlab

    RequestDeadlineExceeded = Class.new(StandardError)

-    attr_accessor :client_ip, :start_thread_cpu_time, :request_start_time
+    attr_accessor :client_ip, :start_thread_cpu_time, :request_start_time, :thread_memory_allocations

    class << self
      def instance

--- a/lib/gitlab/sidekiq_logging/structured_logger.rb
+++ b/lib/gitlab/sidekiq_logging/structured_logger.rb
@@ -11,6 +11,7 @@ module Gitlab
      def call(job, queue)
        started_time = get_time
        base_payload = parse_job(job)
+
        ActiveRecord::LogSubscriber.reset_runtime

        Sidekiq.logger.info log_job_start(job, base_payload)
@@ -38,7 +39,9 @@ module Gitlab
      private

      def add_instrumentation_keys!(job, output_payload)
-        output_payload.merge!(job.slice(*::Gitlab::InstrumentationHelper.keys))
+        instrumentation_values = job.slice(*::Gitlab::InstrumentationHelper.keys).stringify_keys
+
+        output_payload.merge!(instrumentation_values)
      end

      def add_logging_extras!(job, output_payload)
@@ -47,10 +50,6 @@ module Gitlab
        )
      end

-      def add_db_counters!(job, output_payload)
-        output_payload.merge!(job.slice(*::Gitlab::Metrics::Subscribers::ActiveRecord::DB_COUNTERS))
-      end
-
      def log_job_start(job, payload)
        payload['message'] = "#{base_message(payload)}: start"
        payload['job_status'] = 'start'
@@ -67,7 +66,6 @@ module Gitlab
        payload = payload.dup
        add_instrumentation_keys!(job, payload)
        add_logging_extras!(job, payload)
-        add_db_counters!(job, payload)

        elapsed_time = elapsed(started_time)
        add_time_keys!(elapsed_time, payload)
@@ -93,10 +91,6 @@ module Gitlab

      def add_time_keys!(time, payload)
        payload['duration_s'] = time[:duration].round(Gitlab::InstrumentationHelper::DURATION_PRECISION)
-
-        # ignore `cpu_s` if the platform does not support Process::CLOCK_THREAD_CPUTIME_ID (time[:cputime] == 0)
-        # supported OS version can be found at: https://www.rubydoc.info/stdlib/core/2.1.6/Process:clock_gettime
-        payload['cpu_s'] = time[:cputime].round(Gitlab::InstrumentationHelper::DURATION_PRECISION) if time[:cputime] > 0
        payload['completed_at'] = Time.now.utc.to_f
      end

@@ -108,17 +102,11 @@ module Gitlab

      def elapsed(t0)
        t1 = get_time
-        {
-          duration: t1[:now] - t0[:now],
-          cputime: t1[:thread_cputime] - t0[:thread_cputime]
-        }
+        { duration: t1[:now] - t0[:now] }
      end

      def get_time
-        {
-          now: current_time,
-          thread_cputime: defined?(Process::CLOCK_THREAD_CPUTIME_ID) ? Process.clock_gettime(Process::CLOCK_THREAD_CPUTIME_ID) : 0
-        }
+        { now: current_time }
      end

      def current_time

--- a/lib/gitlab/sidekiq_middleware/instrumentation_logger.rb
+++ b/lib/gitlab/sidekiq_middleware/instrumentation_logger.rb
@@ -4,8 +4,11 @@ module Gitlab
  module SidekiqMiddleware
    class InstrumentationLogger
      def call(worker, job, queue)
+        ::Gitlab::InstrumentationHelper.init_instrumentation_data
+
        yield

+      ensure
        # The Sidekiq logger is called outside the middleware block, so
        # we need to modify the job hash to pass along this information
        # since RequestStore is only active in the Sidekiq middleware.

--- a/lib/gitlab/utils/measuring.rb
+++ b/lib/gitlab/utils/measuring.rb
@@ -53,14 +53,10 @@ module Gitlab
      end

      def with_gc_stats
-        GC.start # perform a full mark-and-sweep
-        stats_before = GC.stat
+        stats = ::Gitlab::Memory::Instrumentation.start_thread_memory_allocations
        yield
-        stats_after = GC.stat
-        @gc_stats = stats_after.map do |key, after_value|
-          before_value = stats_before[key]
-          [key, before: before_value, after: after_value, diff: after_value - before_value]
-        end.to_h
+      ensure
+        @gc_stats = ::Gitlab::Memory::Instrumentation.measure_thread_memory_allocations(stats)
      end

      def with_measure_time

--- a/spec/initializers/lograge_spec.rb
+++ b/spec/initializers/lograge_spec.rb
@@ -64,11 +64,11 @@ RSpec.describe 'lograge', type: :request do
        )

      expect(Lograge.formatter).to receive(:call)
-        .with(a_hash_including(cpu_s: 0.11))
+        .with(a_hash_including(cpu_s: 0.111112))
        .and_call_original

      expect(Lograge.logger).to receive(:send)
-        .with(anything, include('"cpu_s":0.11'))
+        .with(anything, include('"cpu_s":0.111112'))
        .and_call_original

      subject
@@ -89,6 +89,26 @@ RSpec.describe 'lograge', type: :request do
      subject
    end

+    context 'when logging memory allocations' do
+      include MemoryInstrumentationHelper
+
+      before do
+        skip_memory_instrumentation!
+      end
+
+      it 'logs memory usage metrics' do
+        expect(Lograge.formatter).to receive(:call)
+          .with(a_hash_including(:mem_objects))
+          .and_call_original
+
+        expect(Lograge.logger).to receive(:send)
+          .with(anything, include('"mem_objects":'))
+          .and_call_original
+
+        subject
+      end
+    end
+
    it 'limits param size' do
      expect(Lograge.formatter).to receive(:call)
        .with(a_hash_including(params: limited_params))

--- a/spec/lib/gitlab/instrumentation_helper_spec.rb
+++ b/spec/lib/gitlab/instrumentation_helper_spec.rb
@@ -9,12 +9,16 @@ RSpec.describe Gitlab::InstrumentationHelper do
  describe '.keys' do
    it 'returns all available payload keys' do
      expected_keys = [
+        :cpu_s,
        :gitaly_calls,
        :gitaly_duration_s,
        :rugged_calls,
        :rugged_duration_s,
        :elasticsearch_calls,
        :elasticsearch_duration_s,
+        :mem_objects,
+        :mem_bytes,
+        :mem_mallocs,
        :redis_calls,
        :redis_duration_s,
        :redis_read_bytes,
@@ -53,10 +57,14 @@ RSpec.describe Gitlab::InstrumentationHelper do

    subject { described_class.add_instrumentation_data(payload) }

-    it 'adds only DB counts by default' do
+    before do
+      described_class.init_instrumentation_data
+    end
+
+    it 'includes DB counts' do
      subject

-      expect(payload).to eq(db_count: 0, db_cached_count: 0, db_write_count: 0)
+      expect(payload).to include(db_count: 0, db_cached_count: 0, db_write_count: 0)
    end

    context 'when Gitaly calls are made' do
@@ -114,6 +122,47 @@ RSpec.describe Gitlab::InstrumentationHelper do
        expect(payload[:throttle_safelist]).to eq('foobar')
      end
    end
+
+    it 'logs cpu_s duration' do
+      subject
+
+      expect(payload).to include(:cpu_s)
+    end
+
+    context 'when logging memory allocations' do
+      include MemoryInstrumentationHelper
+
+      before do
+        skip_memory_instrumentation!
+      end
+
+      it 'logs memory usage metrics' do
+        subject
+
+        expect(payload).to include(
+          :mem_objects,
+          :mem_bytes,
+          :mem_mallocs
+        )
+      end
+
+      context 'when trace_memory_allocations is disabled' do
+        before do
+          stub_feature_flags(trace_memory_allocations: false)
+          Gitlab::Memory::Instrumentation.ensure_feature_flag!
+        end
+
+        it 'does not log memory usage metrics' do
+          subject
+
+          expect(payload).not_to include(
+            :mem_objects,
+            :mem_bytes,
+            :mem_mallocs
+          )
+        end
+      end
+    end
  end

  describe '.queue_duration_for_job' do

--- a/spec/lib/gitlab/memory/instrumentation_spec.rb
+++ b/spec/lib/gitlab/memory/instrumentation_spec.rb
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Gitlab::Memory::Instrumentation do
+  include MemoryInstrumentationHelper
+
+  before do
+    skip_memory_instrumentation!
+  end
+
+  describe '.available?' do
+    it 'returns true' do
+      expect(described_class).to be_available
+    end
+  end
+
+  describe '.start_thread_memory_allocations' do
+    subject { described_class.start_thread_memory_allocations }
+
+    context 'when feature flag trace_memory_allocations is enabled' do
+      before do
+        stub_feature_flags(trace_memory_allocations: true)
+      end
+
+      it 'a hash is returned' do
+        is_expected.not_to be_empty
+      end
+    end
+
+    context 'when feature flag trace_memory_allocations is disabled' do
+      before do
+        stub_feature_flags(trace_memory_allocations: false)
+      end
+
+      it 'a nil is returned' do
+        is_expected.to be_nil
+      end
+    end
+
+    context 'when feature is unavailable' do
+      before do
+        allow(described_class).to receive(:available?) { false }
+      end
+
+      it 'a nil is returned' do
+        is_expected.to be_nil
+      end
+    end
+  end
+
+  describe '.with_memory_allocations' do
+    let(:ntimes) { 100 }
+
+    subject do
+      described_class.with_memory_allocations do
+        Array.new(1000).map { '0' * 100 }
+      end
+    end
+
+    before do
+      expect(described_class).to receive(:start_thread_memory_allocations).and_call_original
+      expect(described_class).to receive(:measure_thread_memory_allocations).and_call_original
+    end
+
+    context 'when feature flag trace_memory_allocations is enabled' do
+      before do
+        stub_feature_flags(trace_memory_allocations: true)
+      end
+
+      it 'a hash is returned' do
+        is_expected.to include(
+          mem_objects: be > 1000,
+          mem_mallocs: be > 1000,
+          mem_bytes: be > 100_000 # 100 items * 100 bytes each
+        )
+      end
+    end
+
+    context 'when feature flag trace_memory_allocations is disabled' do
+      before do
+        stub_feature_flags(trace_memory_allocations: false)
+      end
+
+      it 'a nil is returned' do
+        is_expected.to be_nil
+      end
+    end
+
+    context 'when feature is unavailable' do
+      before do
+        allow(described_class).to receive(:available?) { false }
+      end
+
+      it 'a nil is returned' do
+        is_expected.to be_nil
+      end
+    end
+  end
+end
--- a/spec/lib/gitlab/middleware/request_context_spec.rb
+++ b/spec/lib/gitlab/middleware/request_context_spec.rb
@@ -18,9 +18,11 @@ RSpec.describe Gitlab::Middleware::RequestContext do
  end

  describe '#call' do
-    context 'setting the client ip' do
-      subject { Gitlab::RequestContext.instance.client_ip }
+    let(:instance) { Gitlab::RequestContext.instance }
+
+    subject { described_class.new(app).call(env) }

+    context 'setting the client ip' do
      context 'with X-Forwarded-For headers' do
        let(:load_balancer_ip) { '1.2.3.4' }
        let(:headers) do
@@ -33,13 +35,7 @@ RSpec.describe Gitlab::Middleware::RequestContext do
        let(:env) { Rack::MockRequest.env_for("/").merge(headers) }

        it 'returns the load balancer IP' do
-          endpoint = proc do
-            [200, {}, ["Hello"]]
-          end
-
-          described_class.new(endpoint).call(env)
-
-          expect(subject).to eq(load_balancer_ip)
+          expect { subject }.to change { instance.client_ip }.from(nil).to(load_balancer_ip)
        end
      end

@@ -47,32 +43,19 @@ RSpec.describe Gitlab::Middleware::RequestContext do
        let(:ip) { '192.168.1.11' }

        before do
-          allow_next_instance_of(Rack::Request) do |instance|
-            allow(instance).to receive(:ip).and_return(ip)
+          allow_next_instance_of(Rack::Request) do |request|
+            allow(request).to receive(:ip).and_return(ip)
          end
-          described_class.new(app).call(env)
        end

-        it { is_expected.to eq(ip) }
-      end
+        it 'sets the `client_ip`' do
+          expect { subject }.to change { instance.client_ip }.from(nil).to(ip)
+        end

-      context 'before RequestContext middleware run' do
-        it { is_expected.to be_nil }
+        it 'sets the `request_start_time`' do
+          expect { subject }.to change { instance.request_start_time }.from(nil).to(Float)
+        end
      end
    end
  end
-
-  context 'setting the thread cpu time' do
-    it 'sets the `start_thread_cpu_time`' do
-      expect { described_class.new(app).call(env) }
-        .to change { Gitlab::RequestContext.instance.start_thread_cpu_time }.from(nil).to(Float)
-    end
-  end
-
-  context 'setting the request start time' do
-    it 'sets the `request_start_time`' do
-      expect { described_class.new(app).call(env) }
-        .to change { Gitlab::RequestContext.instance.request_start_time }.from(nil).to(Float)
-    end
-  end
 end
--- a/spec/lib/gitlab/sidekiq_logging/structured_logger_spec.rb
+++ b/spec/lib/gitlab/sidekiq_logging/structured_logger_spec.rb
--- a/spec/support/memory_instrumentation_helper.rb
+++ b/spec/support/memory_instrumentation_helper.rb
+# frozen_string_literal: true
+
+# Memory instrumentation can only be done if running on a valid Ruby
+#
+# This concept is currently tried to be upstreamed here:
+# - https://github.com/ruby/ruby/pull/3978
+module MemoryInstrumentationHelper
+  def skip_memory_instrumentation!
+    return if ::Gitlab::Memory::Instrumentation.available?
+
+    # if we are running in CI, a test cannot be skipped
+    return if ENV['CI']
+
+    skip 'Missing a memory instrumentation patch. ' \
+      'More information can be found here: https://gitlab.com/gitlab-org/gitlab/-/issues/296530.'
+  end
+end