Commit e96a85ad authored by Kamil Trzciński's avatar Kamil Trzciński

Use a custom Ruby patch to instrument memory usage

This uses a custom RubyVM patch to instrument
per-request/per-worker memory allocations considering
that GitLab runs in a multi-threaded environment.

This also slightly refactors a way how we start instrumentation.
parent ebde0372
---
title: Use a custom Ruby patch to instrument memory usage
merge_request: 52306
author:
type: added
---
name: trace_memory_allocations
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/52306
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/299524
milestone: '13.9'
type: development
group: group::memory
default_enabled: false
...@@ -7,16 +7,32 @@ module Gitlab ...@@ -7,16 +7,32 @@ module Gitlab
DURATION_PRECISION = 6 # microseconds DURATION_PRECISION = 6 # microseconds
def keys def keys
@keys ||= [:gitaly_calls, @keys ||= [
:gitaly_duration_s, :cpu_s,
:rugged_calls, :gitaly_calls,
:rugged_duration_s, :gitaly_duration_s,
:elasticsearch_calls, :rugged_calls,
:elasticsearch_duration_s, :rugged_duration_s,
*::Gitlab::Instrumentation::Redis.known_payload_keys, :elasticsearch_calls,
*::Gitlab::Metrics::Subscribers::ActiveRecord::DB_COUNTERS, :elasticsearch_duration_s,
*::Gitlab::Metrics::Subscribers::ExternalHttp::KNOWN_PAYLOAD_KEYS, *::Gitlab::Memory::Instrumentation::KEY_MAPPING.values,
*::Gitlab::Metrics::Subscribers::RackAttack::PAYLOAD_KEYS] *::Gitlab::Instrumentation::Redis.known_payload_keys,
*::Gitlab::Metrics::Subscribers::ActiveRecord::DB_COUNTERS,
*::Gitlab::Metrics::Subscribers::ExternalHttp::KNOWN_PAYLOAD_KEYS,
*::Gitlab::Metrics::Subscribers::RackAttack::PAYLOAD_KEYS
]
end
def init_instrumentation_data(request_ip: nil)
# Set `request_start_time` only if this is request
# This is done, as `request_start_time` imply `request_deadline`
if request_ip
Gitlab::RequestContext.instance.client_ip = request_ip
Gitlab::RequestContext.instance.request_start_time = Gitlab::Metrics::System.real_time
end
Gitlab::RequestContext.instance.start_thread_cpu_time = Gitlab::Metrics::System.thread_cpu_time
Gitlab::RequestContext.instance.thread_memory_allocations = Gitlab::Memory::Instrumentation.start_thread_memory_allocations
end end
def add_instrumentation_data(payload) def add_instrumentation_data(payload)
...@@ -28,6 +44,8 @@ module Gitlab ...@@ -28,6 +44,8 @@ module Gitlab
instrument_active_record(payload) instrument_active_record(payload)
instrument_external_http(payload) instrument_external_http(payload)
instrument_rack_attack(payload) instrument_rack_attack(payload)
instrument_cpu(payload)
instrument_thread_memory_allocations(payload)
end end
def instrument_gitaly(payload) def instrument_gitaly(payload)
...@@ -89,6 +107,19 @@ module Gitlab ...@@ -89,6 +107,19 @@ module Gitlab
payload.merge!(::Gitlab::Metrics::Subscribers::RackAttack.payload) payload.merge!(::Gitlab::Metrics::Subscribers::RackAttack.payload)
end end
def instrument_cpu(payload)
cpu_s = ::Gitlab::Metrics::System.thread_cpu_duration(
::Gitlab::RequestContext.instance.start_thread_cpu_time)
payload[:cpu_s] = cpu_s.round(DURATION_PRECISION) if cpu_s
end
def instrument_thread_memory_allocations(payload)
counters = ::Gitlab::Memory::Instrumentation.measure_thread_memory_allocations(
::Gitlab::RequestContext.instance.thread_memory_allocations)
payload.merge!(counters) if counters
end
# Returns the queuing duration for a Sidekiq job in seconds, as a float, if the # Returns the queuing duration for a Sidekiq job in seconds, as a float, if the
# `enqueued_at` field or `created_at` field is available. # `enqueued_at` field or `created_at` field is available.
# #
...@@ -115,7 +146,7 @@ module Gitlab ...@@ -115,7 +146,7 @@ module Gitlab
# #
# @param [Time] start # @param [Time] start
def self.elapsed_by_absolute_time(start) def self.elapsed_by_absolute_time(start)
(Time.now - start).to_f.round(6) (Time.now - start).to_f.round(DURATION_PRECISION)
end end
private_class_method :elapsed_by_absolute_time private_class_method :elapsed_by_absolute_time
......
...@@ -29,10 +29,6 @@ module Gitlab ...@@ -29,10 +29,6 @@ module Gitlab
payload[:etag_route] = event.payload[:etag_route] if event.payload[:etag_route] payload[:etag_route] = event.payload[:etag_route] if event.payload[:etag_route]
payload[Labkit::Correlation::CorrelationId::LOG_KEY] = event.payload[Labkit::Correlation::CorrelationId::LOG_KEY] || Labkit::Correlation::CorrelationId.current_id payload[Labkit::Correlation::CorrelationId::LOG_KEY] = event.payload[Labkit::Correlation::CorrelationId::LOG_KEY] || Labkit::Correlation::CorrelationId.current_id
if cpu_s = Gitlab::Metrics::System.thread_cpu_duration(::Gitlab::RequestContext.instance.start_thread_cpu_time)
payload[:cpu_s] = cpu_s.round(2)
end
CLOUDFLARE_CUSTOM_HEADERS.each do |_, value| CLOUDFLARE_CUSTOM_HEADERS.each do |_, value|
payload[value] = event.payload[value] if event.payload[value] payload[value] = event.payload[value] if event.payload[value]
end end
......
# frozen_string_literal: true
# This class uses a custom Ruby patch to allow
# a per-thread memory allocation tracking in a efficient manner
#
# This concept is currently tried to be upstreamed here:
# - https://github.com/ruby/ruby/pull/3978
module Gitlab
module Memory
class Instrumentation
KEY_MAPPING = {
total_allocated_objects: :mem_objects,
total_malloc_bytes: :mem_bytes,
total_mallocs: :mem_mallocs
}.freeze
MUTEX = Mutex.new
def self.available?
Thread.respond_to?(:trace_memory_allocations=) &&
Thread.current.respond_to?(:memory_allocations)
end
# This method changes a global state
def self.ensure_feature_flag!
return unless available?
enabled = Feature.enabled?(:trace_memory_allocations)
return if enabled == Thread.trace_memory_allocations
MUTEX.synchronize do
# This enables or disables feature dynamically
# based on a feature flag
Thread.trace_memory_allocations = enabled
end
end
def self.start_thread_memory_allocations
return unless available?
ensure_feature_flag!
# it will return `nil` if disabled
Thread.current.memory_allocations
end
# This method returns a hash with the following keys:
# - mem_objects: a number of allocated heap slots (as reflected by GC)
# - mem_mallocs: a number of malloc calls
# - mem_bytes: a number of bytes allocated with a mallocs tied to heap slots
def self.measure_thread_memory_allocations(previous)
return unless available?
return unless previous
current = Thread.current.memory_allocations
return unless current
# calculate difference in a memory allocations
previous.to_h do |key, value|
[KEY_MAPPING.fetch(key), current[key].to_i - value]
end
end
def self.with_memory_allocations
previous = self.start_thread_memory_allocations
yield
self.measure_thread_memory_allocations(previous)
end
end
end
end
...@@ -16,9 +16,7 @@ module Gitlab ...@@ -16,9 +16,7 @@ module Gitlab
# load balancer's IP. # load balancer's IP.
req = Rack::Request.new(env) req = Rack::Request.new(env)
Gitlab::RequestContext.instance.client_ip = req.ip ::Gitlab::InstrumentationHelper.init_instrumentation_data(request_ip: req.ip)
Gitlab::RequestContext.instance.start_thread_cpu_time = Gitlab::Metrics::System.thread_cpu_time
Gitlab::RequestContext.instance.request_start_time = Gitlab::Metrics::System.real_time
@app.call(env) @app.call(env)
end end
......
...@@ -7,7 +7,7 @@ module Gitlab ...@@ -7,7 +7,7 @@ module Gitlab
RequestDeadlineExceeded = Class.new(StandardError) RequestDeadlineExceeded = Class.new(StandardError)
attr_accessor :client_ip, :start_thread_cpu_time, :request_start_time attr_accessor :client_ip, :start_thread_cpu_time, :request_start_time, :thread_memory_allocations
class << self class << self
def instance def instance
......
...@@ -11,6 +11,7 @@ module Gitlab ...@@ -11,6 +11,7 @@ module Gitlab
def call(job, queue) def call(job, queue)
started_time = get_time started_time = get_time
base_payload = parse_job(job) base_payload = parse_job(job)
ActiveRecord::LogSubscriber.reset_runtime ActiveRecord::LogSubscriber.reset_runtime
Sidekiq.logger.info log_job_start(job, base_payload) Sidekiq.logger.info log_job_start(job, base_payload)
...@@ -38,7 +39,9 @@ module Gitlab ...@@ -38,7 +39,9 @@ module Gitlab
private private
def add_instrumentation_keys!(job, output_payload) def add_instrumentation_keys!(job, output_payload)
output_payload.merge!(job.slice(*::Gitlab::InstrumentationHelper.keys)) instrumentation_values = job.slice(*::Gitlab::InstrumentationHelper.keys).stringify_keys
output_payload.merge!(instrumentation_values)
end end
def add_logging_extras!(job, output_payload) def add_logging_extras!(job, output_payload)
...@@ -47,10 +50,6 @@ module Gitlab ...@@ -47,10 +50,6 @@ module Gitlab
) )
end end
def add_db_counters!(job, output_payload)
output_payload.merge!(job.slice(*::Gitlab::Metrics::Subscribers::ActiveRecord::DB_COUNTERS))
end
def log_job_start(job, payload) def log_job_start(job, payload)
payload['message'] = "#{base_message(payload)}: start" payload['message'] = "#{base_message(payload)}: start"
payload['job_status'] = 'start' payload['job_status'] = 'start'
...@@ -67,7 +66,6 @@ module Gitlab ...@@ -67,7 +66,6 @@ module Gitlab
payload = payload.dup payload = payload.dup
add_instrumentation_keys!(job, payload) add_instrumentation_keys!(job, payload)
add_logging_extras!(job, payload) add_logging_extras!(job, payload)
add_db_counters!(job, payload)
elapsed_time = elapsed(started_time) elapsed_time = elapsed(started_time)
add_time_keys!(elapsed_time, payload) add_time_keys!(elapsed_time, payload)
...@@ -93,10 +91,6 @@ module Gitlab ...@@ -93,10 +91,6 @@ module Gitlab
def add_time_keys!(time, payload) def add_time_keys!(time, payload)
payload['duration_s'] = time[:duration].round(Gitlab::InstrumentationHelper::DURATION_PRECISION) payload['duration_s'] = time[:duration].round(Gitlab::InstrumentationHelper::DURATION_PRECISION)
# ignore `cpu_s` if the platform does not support Process::CLOCK_THREAD_CPUTIME_ID (time[:cputime] == 0)
# supported OS version can be found at: https://www.rubydoc.info/stdlib/core/2.1.6/Process:clock_gettime
payload['cpu_s'] = time[:cputime].round(Gitlab::InstrumentationHelper::DURATION_PRECISION) if time[:cputime] > 0
payload['completed_at'] = Time.now.utc.to_f payload['completed_at'] = Time.now.utc.to_f
end end
...@@ -108,17 +102,11 @@ module Gitlab ...@@ -108,17 +102,11 @@ module Gitlab
def elapsed(t0) def elapsed(t0)
t1 = get_time t1 = get_time
{ { duration: t1[:now] - t0[:now] }
duration: t1[:now] - t0[:now],
cputime: t1[:thread_cputime] - t0[:thread_cputime]
}
end end
def get_time def get_time
{ { now: current_time }
now: current_time,
thread_cputime: defined?(Process::CLOCK_THREAD_CPUTIME_ID) ? Process.clock_gettime(Process::CLOCK_THREAD_CPUTIME_ID) : 0
}
end end
def current_time def current_time
......
...@@ -4,8 +4,11 @@ module Gitlab ...@@ -4,8 +4,11 @@ module Gitlab
module SidekiqMiddleware module SidekiqMiddleware
class InstrumentationLogger class InstrumentationLogger
def call(worker, job, queue) def call(worker, job, queue)
::Gitlab::InstrumentationHelper.init_instrumentation_data
yield yield
ensure
# The Sidekiq logger is called outside the middleware block, so # The Sidekiq logger is called outside the middleware block, so
# we need to modify the job hash to pass along this information # we need to modify the job hash to pass along this information
# since RequestStore is only active in the Sidekiq middleware. # since RequestStore is only active in the Sidekiq middleware.
......
...@@ -53,14 +53,10 @@ module Gitlab ...@@ -53,14 +53,10 @@ module Gitlab
end end
def with_gc_stats def with_gc_stats
GC.start # perform a full mark-and-sweep stats = ::Gitlab::Memory::Instrumentation.start_thread_memory_allocations
stats_before = GC.stat
yield yield
stats_after = GC.stat ensure
@gc_stats = stats_after.map do |key, after_value| @gc_stats = ::Gitlab::Memory::Instrumentation.measure_thread_memory_allocations(stats)
before_value = stats_before[key]
[key, before: before_value, after: after_value, diff: after_value - before_value]
end.to_h
end end
def with_measure_time def with_measure_time
......
...@@ -64,11 +64,11 @@ RSpec.describe 'lograge', type: :request do ...@@ -64,11 +64,11 @@ RSpec.describe 'lograge', type: :request do
) )
expect(Lograge.formatter).to receive(:call) expect(Lograge.formatter).to receive(:call)
.with(a_hash_including(cpu_s: 0.11)) .with(a_hash_including(cpu_s: 0.111112))
.and_call_original .and_call_original
expect(Lograge.logger).to receive(:send) expect(Lograge.logger).to receive(:send)
.with(anything, include('"cpu_s":0.11')) .with(anything, include('"cpu_s":0.111112'))
.and_call_original .and_call_original
subject subject
...@@ -89,6 +89,26 @@ RSpec.describe 'lograge', type: :request do ...@@ -89,6 +89,26 @@ RSpec.describe 'lograge', type: :request do
subject subject
end end
context 'when logging memory allocations' do
include MemoryInstrumentationHelper
before do
skip_memory_instrumentation!
end
it 'logs memory usage metrics' do
expect(Lograge.formatter).to receive(:call)
.with(a_hash_including(:mem_objects))
.and_call_original
expect(Lograge.logger).to receive(:send)
.with(anything, include('"mem_objects":'))
.and_call_original
subject
end
end
it 'limits param size' do it 'limits param size' do
expect(Lograge.formatter).to receive(:call) expect(Lograge.formatter).to receive(:call)
.with(a_hash_including(params: limited_params)) .with(a_hash_including(params: limited_params))
......
...@@ -9,12 +9,16 @@ RSpec.describe Gitlab::InstrumentationHelper do ...@@ -9,12 +9,16 @@ RSpec.describe Gitlab::InstrumentationHelper do
describe '.keys' do describe '.keys' do
it 'returns all available payload keys' do it 'returns all available payload keys' do
expected_keys = [ expected_keys = [
:cpu_s,
:gitaly_calls, :gitaly_calls,
:gitaly_duration_s, :gitaly_duration_s,
:rugged_calls, :rugged_calls,
:rugged_duration_s, :rugged_duration_s,
:elasticsearch_calls, :elasticsearch_calls,
:elasticsearch_duration_s, :elasticsearch_duration_s,
:mem_objects,
:mem_bytes,
:mem_mallocs,
:redis_calls, :redis_calls,
:redis_duration_s, :redis_duration_s,
:redis_read_bytes, :redis_read_bytes,
...@@ -53,10 +57,14 @@ RSpec.describe Gitlab::InstrumentationHelper do ...@@ -53,10 +57,14 @@ RSpec.describe Gitlab::InstrumentationHelper do
subject { described_class.add_instrumentation_data(payload) } subject { described_class.add_instrumentation_data(payload) }
it 'adds only DB counts by default' do before do
described_class.init_instrumentation_data
end
it 'includes DB counts' do
subject subject
expect(payload).to eq(db_count: 0, db_cached_count: 0, db_write_count: 0) expect(payload).to include(db_count: 0, db_cached_count: 0, db_write_count: 0)
end end
context 'when Gitaly calls are made' do context 'when Gitaly calls are made' do
...@@ -114,6 +122,47 @@ RSpec.describe Gitlab::InstrumentationHelper do ...@@ -114,6 +122,47 @@ RSpec.describe Gitlab::InstrumentationHelper do
expect(payload[:throttle_safelist]).to eq('foobar') expect(payload[:throttle_safelist]).to eq('foobar')
end end
end end
it 'logs cpu_s duration' do
subject
expect(payload).to include(:cpu_s)
end
context 'when logging memory allocations' do
include MemoryInstrumentationHelper
before do
skip_memory_instrumentation!
end
it 'logs memory usage metrics' do
subject
expect(payload).to include(
:mem_objects,
:mem_bytes,
:mem_mallocs
)
end
context 'when trace_memory_allocations is disabled' do
before do
stub_feature_flags(trace_memory_allocations: false)
Gitlab::Memory::Instrumentation.ensure_feature_flag!
end
it 'does not log memory usage metrics' do
subject
expect(payload).not_to include(
:mem_objects,
:mem_bytes,
:mem_mallocs
)
end
end
end
end end
describe '.queue_duration_for_job' do describe '.queue_duration_for_job' do
......
# frozen_string_literal: true
require 'spec_helper'
RSpec.describe Gitlab::Memory::Instrumentation do
include MemoryInstrumentationHelper
before do
skip_memory_instrumentation!
end
describe '.available?' do
it 'returns true' do
expect(described_class).to be_available
end
end
describe '.start_thread_memory_allocations' do
subject { described_class.start_thread_memory_allocations }
context 'when feature flag trace_memory_allocations is enabled' do
before do
stub_feature_flags(trace_memory_allocations: true)
end
it 'a hash is returned' do
is_expected.not_to be_empty
end
end
context 'when feature flag trace_memory_allocations is disabled' do
before do
stub_feature_flags(trace_memory_allocations: false)
end
it 'a nil is returned' do
is_expected.to be_nil
end
end
context 'when feature is unavailable' do
before do
allow(described_class).to receive(:available?) { false }
end
it 'a nil is returned' do
is_expected.to be_nil
end
end
end
describe '.with_memory_allocations' do
let(:ntimes) { 100 }
subject do
described_class.with_memory_allocations do
Array.new(1000).map { '0' * 100 }
end
end
before do
expect(described_class).to receive(:start_thread_memory_allocations).and_call_original
expect(described_class).to receive(:measure_thread_memory_allocations).and_call_original
end
context 'when feature flag trace_memory_allocations is enabled' do
before do
stub_feature_flags(trace_memory_allocations: true)
end
it 'a hash is returned' do
is_expected.to include(
mem_objects: be > 1000,
mem_mallocs: be > 1000,
mem_bytes: be > 100_000 # 100 items * 100 bytes each
)
end
end
context 'when feature flag trace_memory_allocations is disabled' do
before do
stub_feature_flags(trace_memory_allocations: false)
end
it 'a nil is returned' do
is_expected.to be_nil
end
end
context 'when feature is unavailable' do
before do
allow(described_class).to receive(:available?) { false }
end
it 'a nil is returned' do
is_expected.to be_nil
end
end
end
end
...@@ -18,9 +18,11 @@ RSpec.describe Gitlab::Middleware::RequestContext do ...@@ -18,9 +18,11 @@ RSpec.describe Gitlab::Middleware::RequestContext do
end end
describe '#call' do describe '#call' do
context 'setting the client ip' do let(:instance) { Gitlab::RequestContext.instance }
subject { Gitlab::RequestContext.instance.client_ip }
subject { described_class.new(app).call(env) }
context 'setting the client ip' do
context 'with X-Forwarded-For headers' do context 'with X-Forwarded-For headers' do
let(:load_balancer_ip) { '1.2.3.4' } let(:load_balancer_ip) { '1.2.3.4' }
let(:headers) do let(:headers) do
...@@ -33,13 +35,7 @@ RSpec.describe Gitlab::Middleware::RequestContext do ...@@ -33,13 +35,7 @@ RSpec.describe Gitlab::Middleware::RequestContext do
let(:env) { Rack::MockRequest.env_for("/").merge(headers) } let(:env) { Rack::MockRequest.env_for("/").merge(headers) }
it 'returns the load balancer IP' do it 'returns the load balancer IP' do
endpoint = proc do expect { subject }.to change { instance.client_ip }.from(nil).to(load_balancer_ip)
[200, {}, ["Hello"]]
end
described_class.new(endpoint).call(env)
expect(subject).to eq(load_balancer_ip)
end end
end end
...@@ -47,32 +43,19 @@ RSpec.describe Gitlab::Middleware::RequestContext do ...@@ -47,32 +43,19 @@ RSpec.describe Gitlab::Middleware::RequestContext do
let(:ip) { '192.168.1.11' } let(:ip) { '192.168.1.11' }
before do before do
allow_next_instance_of(Rack::Request) do |instance| allow_next_instance_of(Rack::Request) do |request|
allow(instance).to receive(:ip).and_return(ip) allow(request).to receive(:ip).and_return(ip)
end end
described_class.new(app).call(env)
end end
it { is_expected.to eq(ip) } it 'sets the `client_ip`' do
end expect { subject }.to change { instance.client_ip }.from(nil).to(ip)
end
context 'before RequestContext middleware run' do it 'sets the `request_start_time`' do
it { is_expected.to be_nil } expect { subject }.to change { instance.request_start_time }.from(nil).to(Float)
end
end end
end end
end end
context 'setting the thread cpu time' do
it 'sets the `start_thread_cpu_time`' do
expect { described_class.new(app).call(env) }
.to change { Gitlab::RequestContext.instance.start_thread_cpu_time }.from(nil).to(Float)
end
end
context 'setting the request start time' do
it 'sets the `request_start_time`' do
expect { described_class.new(app).call(env) }
.to change { Gitlab::RequestContext.instance.request_start_time }.from(nil).to(Float)
end
end
end end
# frozen_string_literal: true
# Memory instrumentation can only be done if running on a valid Ruby
#
# This concept is currently tried to be upstreamed here:
# - https://github.com/ruby/ruby/pull/3978
module MemoryInstrumentationHelper
def skip_memory_instrumentation!
return if ::Gitlab::Memory::Instrumentation.available?
# if we are running in CI, a test cannot be skipped
return if ENV['CI']
skip 'Missing a memory instrumentation patch. ' \
'More information can be found here: https://gitlab.com/gitlab-org/gitlab/-/issues/296530.'
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment