Commit 2e2f8867 authored by Sean McGivern's avatar Sean McGivern

Log when SidekiqStatus is used unexpectedly

We want to make SidekiqStatus purely opt-in. All known users now set a
`status_expiration` field on the job, either through the
`sidekiq_options` on the worker class, or using
`Worker.with_status.perform_async`.

However, before we start only tracking job statuses for these known
cases, we want to verify that we are not missing any cases. This commit:

1. Makes the client middleware set a different value in Redis when the
   job has opted in (2 instead of 1).
2. Changes the status checking method to log when it finds the default
   value (1), indicating that the job was checked but not opted in.

Because item 2 can only work when the job is enqueued or running, it's
possible we would miss some edge cases that only check job status after
the job finishes. This should be smoothed out across all runs of the
various workers, though: if a worker runs so fast that _all_ of its
status checks show that it is done, then we probably don't need to worry
too much about checking its status anyway!

This is behind the feature flag log_implicit_sidekiq_status_calls, which
is disabled by default. It should be safe to use a feature flag here as
SidekiqStatus isn't read by middleware, only set - it's read by models
and workers, and the model methods will also be called from HTTP
endpoints or workers.
parent 48c542d5
---
name: log_implicit_sidekiq_status_calls
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/74815
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/343964
milestone: '14.6'
type: development
group: group::scalability
default_enabled: false
......@@ -63,7 +63,7 @@ RSpec.describe Geo::RepositoryShardSyncWorker, :geo, :clean_gitlab_redis_cache,
context 'number of scheduled jobs exceeds capacity' do
it 'schedules 0 jobs' do
is_expected.to receive(:scheduled_job_ids).and_return(1..1000).at_least(:once)
is_expected.to receive(:scheduled_job_ids).and_return((1..1000).to_a).at_least(:once)
is_expected.not_to receive(:schedule_job)
Sidekiq::Testing.inline! { subject.perform(shard_name) }
......
......@@ -170,7 +170,7 @@ RSpec.describe Geo::RepositoryVerification::Primary::ShardWorker, :clean_gitlab_
it 'does not schedule jobs when number of scheduled jobs exceeds capacity' do
create(:project)
is_expected.to receive(:scheduled_job_ids).and_return(1..1000).at_least(:once)
is_expected.to receive(:scheduled_job_ids).and_return((1..1000).to_a).at_least(:once)
is_expected.not_to receive(:schedule_job)
Sidekiq::Testing.inline! { subject.perform(shard_name) }
......
......@@ -64,7 +64,7 @@ RSpec.describe Geo::RepositoryVerification::Secondary::ShardWorker, :geo, :reque
it 'does not schedule jobs when number of scheduled jobs exceeds capacity' do
create(:project)
is_expected.to receive(:scheduled_job_ids).and_return(1..1000).at_least(:once)
is_expected.to receive(:scheduled_job_ids).and_return((1..1000).to_a).at_least(:once)
is_expected.not_to receive(:schedule_job)
Sidekiq::Testing.inline! { subject.perform(shard_name) }
......
......@@ -29,13 +29,16 @@ module Gitlab
# for most jobs.
DEFAULT_EXPIRATION = 30.minutes.to_i
DEFAULT_VALUE = 1
DEFAULT_VALUE_MESSAGE = 'Keys using the default value for SidekiqStatus detected'
# Starts tracking of the given job.
#
# jid - The Sidekiq job ID
# expire - The expiration time of the Redis key.
def self.set(jid, expire = DEFAULT_EXPIRATION)
def self.set(jid, expire = DEFAULT_EXPIRATION, value: DEFAULT_VALUE)
Sidekiq.redis do |redis|
redis.set(key_for(jid), 1, ex: expire)
redis.set(key_for(jid), value, ex: expire)
end
end
......@@ -88,13 +91,20 @@ module Gitlab
# true = job is still running or enqueued
# false = job completed
def self.job_status(job_ids)
return [] if job_ids.empty?
keys = job_ids.map { |jid| key_for(jid) }
results = Sidekiq.redis { |redis| redis.mget(*keys) }
Sidekiq.redis do |redis|
redis.pipelined do
keys.each { |key| redis.exists(key) }
end
if Feature.enabled?(:log_implicit_sidekiq_status_calls, default_enabled: :yaml)
to_log = keys.zip(results).select do |_key, result|
result == DEFAULT_VALUE.to_s
end.map(&:first)
Sidekiq.logger.info(message: DEFAULT_VALUE_MESSAGE, keys: to_log) if to_log.any?
end
results.map { |result| !result.nil? }
end
# Returns the JIDs that are completed
......
......@@ -5,8 +5,9 @@ module Gitlab
class ClientMiddleware
def call(_, job, _, _)
status_expiration = job['status_expiration'] || Gitlab::SidekiqStatus::DEFAULT_EXPIRATION
value = job['status_expiration'] ? 2 : Gitlab::SidekiqStatus::DEFAULT_VALUE
Gitlab::SidekiqStatus.set(job['jid'], status_expiration)
Gitlab::SidekiqStatus.set(job['jid'], status_expiration, value: value)
yield
end
end
......
# frozen_string_literal: true
require 'spec_helper'
require 'fast_spec_helper'
RSpec.describe Gitlab::SidekiqStatus::ClientMiddleware do
describe '#call' do
it 'tracks the job in Redis' do
expect(Gitlab::SidekiqStatus).to receive(:set).with('123', Gitlab::SidekiqStatus::DEFAULT_EXPIRATION)
context 'when the job has status_expiration set' do
it 'tracks the job in Redis with a value of 2' do
expect(Gitlab::SidekiqStatus).to receive(:set).with('123', 1.hour.to_i, value: 2)
described_class.new
.call('Foo', { 'jid' => '123' }, double(:queue), double(:pool)) { nil }
described_class.new
.call('Foo', { 'jid' => '123', 'status_expiration' => 1.hour.to_i }, double(:queue), double(:pool)) { nil }
end
end
context 'when the job does not have status_expiration set' do
it 'tracks the job in Redis with a value of 1' do
expect(Gitlab::SidekiqStatus).to receive(:set).with('123', Gitlab::SidekiqStatus::DEFAULT_EXPIRATION, value: 1)
described_class.new
.call('Foo', { 'jid' => '123' }, double(:queue), double(:pool)) { nil }
end
end
end
end
......@@ -12,6 +12,31 @@ RSpec.describe Gitlab::SidekiqStatus, :clean_gitlab_redis_queues, :clean_gitlab_
Sidekiq.redis do |redis|
expect(redis.exists(key)).to eq(true)
expect(redis.ttl(key) > 0).to eq(true)
expect(redis.get(key)).to eq(described_class::DEFAULT_VALUE.to_s)
end
end
it 'allows overriding the expiration time' do
described_class.set('123', described_class::DEFAULT_EXPIRATION * 2)
key = described_class.key_for('123')
Sidekiq.redis do |redis|
expect(redis.exists(key)).to eq(true)
expect(redis.ttl(key) > described_class::DEFAULT_EXPIRATION).to eq(true)
expect(redis.get(key)).to eq(described_class::DEFAULT_VALUE.to_s)
end
end
it 'allows overriding the default value' do
described_class.set('123', value: 2)
key = described_class.key_for('123')
Sidekiq.redis do |redis|
expect(redis.exists(key)).to eq(true)
expect(redis.ttl(key) > 0).to eq(true)
expect(redis.get(key)).to eq('2')
end
end
end
......@@ -88,7 +113,7 @@ RSpec.describe Gitlab::SidekiqStatus, :clean_gitlab_redis_queues, :clean_gitlab_
end
end
describe 'completed' do
describe '.completed_jids' do
it 'returns the completed job' do
expect(described_class.completed_jids(%w(123))).to eq(['123'])
end
......@@ -100,4 +125,46 @@ RSpec.describe Gitlab::SidekiqStatus, :clean_gitlab_redis_queues, :clean_gitlab_
expect(described_class.completed_jids(%w(123 456 789))).to eq(['789'])
end
end
describe '.job_status' do
it 'returns an array of boolean values' do
described_class.set('123')
described_class.set('456')
described_class.unset('123')
expect(described_class.job_status(%w(123 456 789))).to eq([false, true, false])
end
it 'handles an empty array' do
expect(described_class.job_status([])).to eq([])
end
context 'when log_implicit_sidekiq_status_calls is enabled' do
it 'logs keys that contained the default value' do
described_class.set('123', value: 2)
described_class.set('456')
described_class.set('012')
expect(Sidekiq.logger).to receive(:info).with(message: described_class::DEFAULT_VALUE_MESSAGE,
keys: [described_class.key_for('456'), described_class.key_for('012')])
expect(described_class.job_status(%w(123 456 789 012))).to eq([true, true, false, true])
end
end
context 'when log_implicit_sidekiq_status_calls is disabled' do
before do
stub_feature_flags(log_implicit_sidekiq_status_calls: false)
end
it 'does not perform any logging' do
described_class.set('123', value: 2)
described_class.set('456')
expect(Sidekiq.logger).not_to receive(:info)
expect(described_class.job_status(%w(123 456 789))).to eq([true, true, false])
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment