Commit 987b4046 authored by Grzegorz Bizon's avatar Grzegorz Bizon

Add additional metrics related to builds queuing

This commit adds additional metrics related to how GitLab queues CI/CD
builds and what operations are being performed to pick a build from the
queue and expose it to GitLab Runner.
parent 6882f4c0
...@@ -13,8 +13,18 @@ module Ci ...@@ -13,8 +13,18 @@ module Ci
@metrics = ::Gitlab::Ci::Queue::Metrics.new(runner) @metrics = ::Gitlab::Ci::Queue::Metrics.new(runner)
end end
# rubocop: disable CodeReuse/ActiveRecord
def execute(params = {}) def execute(params = {})
@metrics.observe_queue_time do
process_queue(params)
end
end
private
# rubocop: disable CodeReuse/ActiveRecord
def process_queue(params)
@metrics.increment_queue_operation(:queue_attempt)
builds = builds =
if runner.instance_type? if runner.instance_type?
builds_for_shared_runner builds_for_shared_runner
...@@ -24,8 +34,6 @@ module Ci ...@@ -24,8 +34,6 @@ module Ci
builds_for_project_runner builds_for_project_runner
end end
valid = true
# pick builds that does not have other tags than runner's one # pick builds that does not have other tags than runner's one
builds = builds.matches_tag_ids(runner.tags.ids) builds = builds.matches_tag_ids(runner.tags.ids)
...@@ -39,14 +47,23 @@ module Ci ...@@ -39,14 +47,23 @@ module Ci
builds = builds.queued_before(params[:job_age].seconds.ago) builds = builds.queued_before(params[:job_age].seconds.ago)
end end
builds.each do |build| @metrics.observe_queue_depth(:depth, builds.to_a.size)
valid = true
depth = 0
builds.each_with_index do |build, index|
depth = index
@metrics.increment_queue_operation(:queue_iteration)
result = process_build(build, params) result = process_build(build, params)
next unless result next unless result
if result.valid? if result.valid?
@metrics.register_success(result.build) @metrics.register_success(result.build)
@metrics.observe_queue_depth(:effective, depth)
return result return result # rubocop:disable Cop/AvoidReturnFromBlocks
else else
# The usage of valid: is described in # The usage of valid: is described in
# handling of ActiveRecord::StaleObjectError # handling of ActiveRecord::StaleObjectError
...@@ -54,22 +71,29 @@ module Ci ...@@ -54,22 +71,29 @@ module Ci
end end
end end
@metrics.increment_queue_operation(:queue_conflict) unless valid
@metrics.observe_queue_depth(:ineffective, depth) unless valid
@metrics.register_failure @metrics.register_failure
Result.new(nil, nil, valid) Result.new(nil, nil, valid)
end end
# rubocop: enable CodeReuse/ActiveRecord # rubocop: enable CodeReuse/ActiveRecord
private
def process_build(build, params) def process_build(build, params)
return unless runner.can_pick?(build) if runner.can_pick?(build)
@metrics.increment_queue_operation(:build_can_pick)
else
@metrics.increment_queue_operation(:build_not_pick)
return
end
# In case when 2 runners try to assign the same build, second runner will be declined # In case when 2 runners try to assign the same build, second runner will be declined
# with StateMachines::InvalidTransition or StaleObjectError when doing run! or save method. # with StateMachines::InvalidTransition or StaleObjectError when doing run! or save method.
if assign_runner!(build, params) if assign_runner!(build, params)
present_build!(build) present_build!(build)
end end
rescue StateMachines::InvalidTransition, ActiveRecord::StaleObjectError rescue ActiveRecord::StaleObjectError
# We are looping to find another build that is not conflicting # We are looping to find another build that is not conflicting
# It also indicates that this build can be picked and passed to runner. # It also indicates that this build can be picked and passed to runner.
# If we don't do it, basically a bunch of runners would be competing for a build # If we don't do it, basically a bunch of runners would be competing for a build
...@@ -79,8 +103,16 @@ module Ci ...@@ -79,8 +103,16 @@ module Ci
# In case we hit the concurrency-access lock, # In case we hit the concurrency-access lock,
# we still have to return 409 in the end, # we still have to return 409 in the end,
# to make sure that this is properly handled by runner. # to make sure that this is properly handled by runner.
@metrics.increment_queue_operation(:build_conflict_lock)
Result.new(nil, nil, false)
rescue StateMachines::InvalidTransition
@metrics.increment_queue_operation(:build_conflict_transition)
Result.new(nil, nil, false) Result.new(nil, nil, false)
rescue => ex rescue => ex
@metrics.increment_queue_operation(:build_conflict_exception)
# If an error (e.g. GRPC::DeadlineExceeded) occurred constructing # If an error (e.g. GRPC::DeadlineExceeded) occurred constructing
# the result, consider this as a failure to be retried. # the result, consider this as a failure to be retried.
scheduler_failure!(build) scheduler_failure!(build)
...@@ -106,8 +138,12 @@ module Ci ...@@ -106,8 +138,12 @@ module Ci
failure_reason, _ = pre_assign_runner_checks.find { |_, check| check.call(build, params) } failure_reason, _ = pre_assign_runner_checks.find { |_, check| check.call(build, params) }
if failure_reason if failure_reason
@metrics.increment_queue_operation(:runner_checks_failed)
build.drop!(failure_reason) build.drop!(failure_reason)
else else
@metrics.increment_queue_operation(:runner_checks_success)
build.run! build.run!
end end
......
...@@ -21,6 +21,7 @@ module EE ...@@ -21,6 +21,7 @@ module EE
# "Hi, we don't have any more builds now, but not everything is right anyway, so try again". # "Hi, we don't have any more builds now, but not everything is right anyway, so try again".
# Runner will retry, but again, against replica, and again will check if replication lag did catch-up. # Runner will retry, but again, against replica, and again will check if replication lag did catch-up.
if !db_all_caught_up && !result.build if !db_all_caught_up && !result.build
# TODO replication lag metric
return ::Ci::RegisterJobService::Result.new(nil, false) # rubocop:disable Cop/AvoidReturnFromBlocks return ::Ci::RegisterJobService::Result.new(nil, false) # rubocop:disable Cop/AvoidReturnFromBlocks
end end
end end
......
...@@ -7,10 +7,32 @@ module Gitlab ...@@ -7,10 +7,32 @@ module Gitlab
extend Gitlab::Utils::StrongMemoize extend Gitlab::Utils::StrongMemoize
QUEUE_DURATION_SECONDS_BUCKETS = [1, 3, 10, 30, 60, 300, 900, 1800, 3600].freeze QUEUE_DURATION_SECONDS_BUCKETS = [1, 3, 10, 30, 60, 300, 900, 1800, 3600].freeze
QUEUE_DEPTH_SIZE_TOTAL_BUCKETS = [1, 5, 10, 50, 100, 500, 1000, 2000, 5000].freeze
QUEUE_ITERATION_DURATION_SECONDS_BUCKETS = [0.1, 0.3, 0.5, 1, 5, 10, 30, 60, 180, 300].freeze
METRICS_SHARD_TAG_PREFIX = 'metrics_shard::' METRICS_SHARD_TAG_PREFIX = 'metrics_shard::'
DEFAULT_METRICS_SHARD = 'default' DEFAULT_METRICS_SHARD = 'default'
JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET = 5.freeze JOBS_RUNNING_FOR_PROJECT_MAX_BUCKET = 5.freeze
OPERATION_COUNTERS = [
:build_can_pick,
:build_not_pick,
:build_conflict_lock,
:build_conflict_exception,
:build_conflict_transition,
:queue_attempt,
:queue_conflict,
:queue_iteration,
:runner_checks_failed,
:runner_checks_success
].freeze
QUEUE_DEPTH_HISTOGRAMS = [
:depth,
:effective,
:ineffective
].freeze
attr_reader :runner attr_reader :runner
def initialize(runner) def initialize(runner)
...@@ -47,6 +69,33 @@ module Gitlab ...@@ -47,6 +69,33 @@ module Gitlab
end end
# rubocop: enable CodeReuse/ActiveRecord # rubocop: enable CodeReuse/ActiveRecord
def increment_queue_operation(operation)
unless OPERATION_COUNTERS.include?(operation)
raise ArgumentError, "unknown queue operation: #{operation}" unless Rails.env.production?
end
self.class.queue_operations_total.increment(operation: operation)
end
def observe_queue_depth(queue, size)
unless QUEUE_DEPTH_HISTOGRAMS.include?(queue)
raise ArgumentError, "unknown queue depth label: #{queue}" unless Rails.env.production?
end
self.class.queue_depth_size_total.observe({ queue: queue }, size.to_f)
end
def observe_queue_time
start_time = ::Gitlab::Metrics::System.monotonic_time
result = yield
seconds = ::Gitlab::Metrics::System.monotonic_time - start_time
self.class.queue_iteration_duration_seconds.observe({}, seconds.to_f)
result
end
def self.failed_attempt_counter def self.failed_attempt_counter
strong_memoize(:failed_attempt_counter) do strong_memoize(:failed_attempt_counter) do
name = :job_register_attempts_failed_total name = :job_register_attempts_failed_total
...@@ -75,6 +124,37 @@ module Gitlab ...@@ -75,6 +124,37 @@ module Gitlab
Gitlab::Metrics.histogram(name, comment, labels, buckets) Gitlab::Metrics.histogram(name, comment, labels, buckets)
end end
end end
def self.queue_operations_total
strong_memoize(:queue_operations_total) do
name = :gitlab_ci_queue_operations_total
comment = 'Counts all the operations that are happening inside a queue'
Gitlab::Metrics.counter(name, comment)
end
end
def self.queue_depth_size_total
strong_memoize(:queue_depth_size_total) do
name = :gitlab_ci_queue_depth_size_total
comment = 'Size of a CI/CD builds queue'
labels = {}
buckets = QUEUE_DEPTH_SIZE_TOTAL_BUCKETS
Gitlab::Metrics.histogram(name, comment, labels, buckets)
end
end
def self.queue_iteration_duration_seconds
strong_memoize(:queue_iteration_duration_seconds) do
name = :gitlab_ci_queue_iteration_duration_seconds
comment = 'Time it takes to find a build in CI/CD queue'
labels = {}
buckets = QUEUE_ITERATION_DURATION_SECONDS_BUCKETS
Gitlab::Metrics.histogram(name, comment, labels, buckets)
end
end
end end
end end
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment