Merge branch 'an-mark-jobs-as-latency-sensitive' into 'master'

Attribute all Sidekiq workers according to their workloads See merge request gitlab-org/gitlab!18066

Merge branch 'an-mark-jobs-as-latency-sensitive' into 'master'
Attribute all Sidekiq workers according to their workloads See merge request gitlab-org/gitlab!18066
6fa15983 · Kamil Trzciński · 629dda38 · 87fa73c5 · 6fa15983 · 6fa15983
Commit 6fa15983 authored Oct 30, 2019 by Kamil Trzciński
73 changed files
--- a/app/models/concerns/worker_attributes.rb
+++ b/app/models/concerns/worker_attributes.rb
@@ -3,6 +3,10 @@
 module WorkerAttributes
  extend ActiveSupport::Concern

+  # Resource boundaries that workers can declare through the
+  # `worker_resource_boundary` attribute
+  VALID_RESOURCE_BOUNDARIES = [:memory, :cpu, :unknown].freeze
+
  class_methods do
    def feature_category(value)
      raise "Invalid category. Use `feature_category_not_owned!` to mark a worker as not owned" if value == :not_owned
@@ -24,6 +28,48 @@ module WorkerAttributes
      get_worker_attribute(:feature_category) == :not_owned
    end

+    # This should be set for jobs that need to be run immediately, or, if
+    # they are delayed, risk creating inconsistencies in the application
+    # that could be perceived by the user as incorrect behavior
+    # (ie, a bug)
+    # See doc/development/sidekiq_style_guide.md#Latency-Sensitive-Jobs
+    # for details
+    def latency_sensitive_worker!
+      worker_attributes[:latency_sensitive] = true
+    end
+
+    # Returns a truthy value if the worker is latency sensitive.
+    # See doc/development/sidekiq_style_guide.md#Latency-Sensitive-Jobs
+    # for details
+    def latency_sensitive_worker?
+      worker_attributes[:latency_sensitive]
+    end
+
+    # Set this attribute on a job when it will call to services outside of the
+    # application, such as 3rd party applications, other k8s clusters etc. See
+    # doc/development/sidekiq_style_guide.md#Jobs-with-External-Dependencies for
+    # details
+    def worker_has_external_dependencies!
+      worker_attributes[:external_dependencies] = true
+    end
+
+    # Returns a truthy value if the worker has external dependencies.
+    # See doc/development/sidekiq_style_guide.md#Jobs-with-External-Dependencies
+    # for details
+    def worker_has_external_dependencies?
+      worker_attributes[:external_dependencies]
+    end
+
+    def worker_resource_boundary(boundary)
+      raise "Invalid boundary" unless VALID_RESOURCE_BOUNDARIES.include? boundary
+
+      worker_attributes[:resource_boundary] = boundary
+    end
+
+    def get_worker_resource_boundary
+      worker_attributes[:resource_boundary] || :unknown
+    end
+
    protected

    # Returns a worker attribute declared on this class or its parent class.

--- a/app/workers/authorized_projects_worker.rb
+++ b/app/workers/authorized_projects_worker.rb
@@ -5,6 +5,7 @@ class AuthorizedProjectsWorker
  prepend WaitableWorker

  feature_category :authentication_and_authorization
+  latency_sensitive_worker!

  # This is a workaround for a Ruby 2.3.7 bug. rspec-mocks cannot restore the
  # visibility of prepended modules. See https://github.com/rspec/rspec-mocks/issues/1231

--- a/app/workers/build_finished_worker.rb
+++ b/app/workers/build_finished_worker.rb
@@ -5,6 +5,8 @@ class BuildFinishedWorker
  include PipelineQueue

  queue_namespace :pipeline_processing
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  # rubocop: disable CodeReuse/ActiveRecord
  def perform(build_id)

--- a/app/workers/build_hooks_worker.rb
+++ b/app/workers/build_hooks_worker.rb
@@ -6,6 +6,7 @@ class BuildHooksWorker

  queue_namespace :pipeline_hooks
  feature_category :continuous_integration
+  latency_sensitive_worker!

  # rubocop: disable CodeReuse/ActiveRecord
  def perform(build_id)

--- a/app/workers/build_queue_worker.rb
+++ b/app/workers/build_queue_worker.rb
@@ -6,6 +6,8 @@ class BuildQueueWorker

  queue_namespace :pipeline_processing
  feature_category :continuous_integration
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  # rubocop: disable CodeReuse/ActiveRecord
  def perform(build_id)

--- a/app/workers/build_success_worker.rb
+++ b/app/workers/build_success_worker.rb
@@ -5,6 +5,7 @@ class BuildSuccessWorker
  include PipelineQueue

  queue_namespace :pipeline_processing
+  latency_sensitive_worker!

  # rubocop: disable CodeReuse/ActiveRecord
  def perform(build_id)

--- a/app/workers/chat_notification_worker.rb
+++ b/app/workers/chat_notification_worker.rb
@@ -4,6 +4,11 @@ class ChatNotificationWorker
  include ApplicationWorker

  feature_category :chatops
+  latency_sensitive_worker!
+  # TODO: break this into multiple jobs
+  # as the `responder` uses external dependencies
+  # See https://gitlab.com/gitlab-com/gl-infra/scalability/issues/34
+  # worker_has_external_dependencies!

  RESCHEDULE_INTERVAL = 2.seconds


--- a/app/workers/ci/build_schedule_worker.rb
+++ b/app/workers/ci/build_schedule_worker.rb
@@ -7,6 +7,7 @@ module Ci

    queue_namespace :pipeline_processing
    feature_category :continuous_integration
+    worker_resource_boundary :cpu

    def perform(build_id)
      ::Ci::Build.find_by_id(build_id).try do |build|

--- a/app/workers/cluster_install_app_worker.rb
+++ b/app/workers/cluster_install_app_worker.rb
@@ -5,6 +5,8 @@ class ClusterInstallAppWorker
  include ClusterQueue
  include ClusterApplications

+  worker_has_external_dependencies!
+
  def perform(app_name, app_id)
    find_application(app_name, app_id) do |app|
      Clusters::Applications::InstallService.new(app).execute

--- a/app/workers/cluster_patch_app_worker.rb
+++ b/app/workers/cluster_patch_app_worker.rb
@@ -5,6 +5,8 @@ class ClusterPatchAppWorker
  include ClusterQueue
  include ClusterApplications

+  worker_has_external_dependencies!
+
  def perform(app_name, app_id)
    find_application(app_name, app_id) do |app|
      Clusters::Applications::PatchService.new(app).execute

--- a/app/workers/cluster_project_configure_worker.rb
+++ b/app/workers/cluster_project_configure_worker.rb
@@ -4,6 +4,8 @@ class ClusterProjectConfigureWorker
  include ApplicationWorker
  include ClusterQueue

+  worker_has_external_dependencies!
+
  def perform(project_id)
    # Scheduled for removal in https://gitlab.com/gitlab-org/gitlab-foss/issues/59319
  end

--- a/app/workers/cluster_provision_worker.rb
+++ b/app/workers/cluster_provision_worker.rb
@@ -4,6 +4,8 @@ class ClusterProvisionWorker
  include ApplicationWorker
  include ClusterQueue

+  worker_has_external_dependencies!
+
  def perform(cluster_id)
    Clusters::Cluster.find_by_id(cluster_id).try do |cluster|
      cluster.provider.try do |provider|

--- a/app/workers/cluster_upgrade_app_worker.rb
+++ b/app/workers/cluster_upgrade_app_worker.rb
@@ -5,6 +5,8 @@ class ClusterUpgradeAppWorker
  include ClusterQueue
  include ClusterApplications

+  worker_has_external_dependencies!
+
  def perform(app_name, app_id)
    find_application(app_name, app_id) do |app|
      Clusters::Applications::UpgradeService.new(app).execute

--- a/app/workers/cluster_wait_for_app_installation_worker.rb
+++ b/app/workers/cluster_wait_for_app_installation_worker.rb
@@ -8,6 +8,9 @@ class ClusterWaitForAppInstallationWorker
  INTERVAL = 10.seconds
  TIMEOUT = 20.minutes

+  worker_has_external_dependencies!
+  worker_resource_boundary :cpu
+
  def perform(app_name, app_id)
    find_application(app_name, app_id) do |app|
      Clusters::Applications::CheckInstallationProgressService.new(app).execute

--- a/app/workers/cluster_wait_for_ingress_ip_address_worker.rb
+++ b/app/workers/cluster_wait_for_ingress_ip_address_worker.rb
@@ -5,6 +5,8 @@ class ClusterWaitForIngressIpAddressWorker
  include ClusterQueue
  include ClusterApplications

+  worker_has_external_dependencies!
+
  def perform(app_name, app_id)
    find_application(app_name, app_id) do |app|
      Clusters::Applications::CheckIngressIpAddressService.new(app).execute

--- a/app/workers/clusters/applications/uninstall_worker.rb
+++ b/app/workers/clusters/applications/uninstall_worker.rb
@@ -7,6 +7,8 @@ module Clusters
      include ClusterQueue
      include ClusterApplications

+      worker_has_external_dependencies!
+
      def perform(app_name, app_id)
        find_application(app_name, app_id) do |app|
          Clusters::Applications::UninstallService.new(app).execute

--- a/app/workers/clusters/applications/wait_for_uninstall_app_worker.rb
+++ b/app/workers/clusters/applications/wait_for_uninstall_app_worker.rb
@@ -10,6 +10,9 @@ module Clusters
      INTERVAL = 10.seconds
      TIMEOUT = 20.minutes

+      worker_has_external_dependencies!
+      worker_resource_boundary :cpu
+
      def perform(app_name, app_id)
        find_application(app_name, app_id) do |app|
          Clusters::Applications::CheckUninstallProgressService.new(app).execute

--- a/app/workers/concerns/gitlab/github_import/object_importer.rb
+++ b/app/workers/concerns/gitlab/github_import/object_importer.rb
@@ -14,6 +14,7 @@ module Gitlab
        include NotifyUponDeath

        feature_category :importers
+        worker_has_external_dependencies!
      end

      # project - An instance of `Project` to import the data into.

--- a/app/workers/create_pipeline_worker.rb
+++ b/app/workers/create_pipeline_worker.rb
@@ -6,6 +6,8 @@ class CreatePipelineWorker

  queue_namespace :pipeline_creation
  feature_category :continuous_integration
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  def perform(project_id, user_id, ref, source, params = {})
    project = Project.find(project_id)

--- a/app/workers/deployments/finished_worker.rb
+++ b/app/workers/deployments/finished_worker.rb
@@ -6,6 +6,7 @@ module Deployments

    queue_namespace :deployment
    feature_category :continuous_delivery
+    worker_resource_boundary :cpu

    def perform(deployment_id)
      Deployment.find_by_id(deployment_id).try(:execute_hooks)

--- a/app/workers/deployments/success_worker.rb
+++ b/app/workers/deployments/success_worker.rb
@@ -6,6 +6,7 @@ module Deployments

    queue_namespace :deployment
    feature_category :continuous_delivery
+    worker_resource_boundary :cpu

    def perform(deployment_id)
      Deployment.find_by_id(deployment_id).try do |deployment|

--- a/app/workers/email_receiver_worker.rb
+++ b/app/workers/email_receiver_worker.rb
@@ -4,6 +4,7 @@ class EmailReceiverWorker
  include ApplicationWorker

  feature_category :issue_tracking
+  latency_sensitive_worker!

  def perform(raw)
    return unless Gitlab::IncomingEmail.enabled?

--- a/app/workers/emails_on_push_worker.rb
+++ b/app/workers/emails_on_push_worker.rb
@@ -6,6 +6,8 @@ class EmailsOnPushWorker
  attr_reader :email, :skip_premailer

  feature_category :source_code_management
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  def perform(project_id, recipients, push_data, options = {})
    options.symbolize_keys!

--- a/app/workers/expire_job_cache_worker.rb
+++ b/app/workers/expire_job_cache_worker.rb
@@ -5,6 +5,7 @@ class ExpireJobCacheWorker
  include PipelineQueue

  queue_namespace :pipeline_cache
+  latency_sensitive_worker!

  # rubocop: disable CodeReuse/ActiveRecord
  def perform(job_id)

--- a/app/workers/expire_pipeline_cache_worker.rb
+++ b/app/workers/expire_pipeline_cache_worker.rb
@@ -5,6 +5,8 @@ class ExpirePipelineCacheWorker
  include PipelineQueue

  queue_namespace :pipeline_cache
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  # rubocop: disable CodeReuse/ActiveRecord
  def perform(pipeline_id)

--- a/app/workers/gitlab_shell_worker.rb
+++ b/app/workers/gitlab_shell_worker.rb
@@ -5,6 +5,7 @@ class GitlabShellWorker
  include Gitlab::ShellAdapter

  feature_category :source_code_management
+  latency_sensitive_worker!

  def perform(action, *arg)
    gitlab_shell.__send__(action, *arg) # rubocop:disable GitlabSecurity/PublicSend

--- a/app/workers/import_issues_csv_worker.rb
+++ b/app/workers/import_issues_csv_worker.rb
@@ -4,6 +4,7 @@ class ImportIssuesCsvWorker
  include ApplicationWorker

  feature_category :issue_tracking
+  worker_resource_boundary :cpu

  sidekiq_retries_exhausted do |job|
    Upload.find(job['args'][2]).destroy

--- a/app/workers/mail_scheduler/notification_service_worker.rb
+++ b/app/workers/mail_scheduler/notification_service_worker.rb
@@ -8,6 +8,7 @@ module MailScheduler
    include MailSchedulerQueue

    feature_category :issue_tracking
+    worker_resource_boundary :cpu

    def perform(meth, *args)
      check_arguments!(args)

--- a/app/workers/merge_worker.rb
+++ b/app/workers/merge_worker.rb
@@ -4,6 +4,7 @@ class MergeWorker
  include ApplicationWorker

  feature_category :source_code_management
+  latency_sensitive_worker!

  def perform(merge_request_id, current_user_id, params)
    params = params.with_indifferent_access

--- a/app/workers/namespaces/prune_aggregation_schedules_worker.rb
+++ b/app/workers/namespaces/prune_aggregation_schedules_worker.rb
@@ -6,6 +6,7 @@ module Namespaces
    include CronjobQueue

    feature_category :source_code_management
+    worker_resource_boundary :cpu

    # Worker to prune pending rows on Namespace::AggregationSchedule
    # It's scheduled to run once a day at 1:05am.

--- a/app/workers/new_issue_worker.rb
+++ b/app/workers/new_issue_worker.rb
@@ -5,6 +5,8 @@ class NewIssueWorker
  include NewIssuable

  feature_category :issue_tracking
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  def perform(issue_id, user_id)
    return unless objects_found?(issue_id, user_id)

--- a/app/workers/new_merge_request_worker.rb
+++ b/app/workers/new_merge_request_worker.rb
@@ -5,6 +5,8 @@ class NewMergeRequestWorker
  include NewIssuable

  feature_category :source_code_management
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  def perform(merge_request_id, user_id)
    return unless objects_found?(merge_request_id, user_id)

--- a/app/workers/new_note_worker.rb
+++ b/app/workers/new_note_worker.rb
@@ -4,6 +4,8 @@ class NewNoteWorker
  include ApplicationWorker

  feature_category :issue_tracking
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  # Keep extra parameter to preserve backwards compatibility with
  # old `NewNoteWorker` jobs (can remove later)

--- a/app/workers/object_pool/join_worker.rb
+++ b/app/workers/object_pool/join_worker.rb
@@ -5,6 +5,8 @@ module ObjectPool
    include ApplicationWorker
    include ObjectPoolQueue

+    worker_resource_boundary :cpu
+
    # The use of pool id is deprecated. Keeping the argument allows old jobs to
    # still be performed.
    def perform(_pool_id, project_id)

--- a/app/workers/pages_domain_removal_cron_worker.rb
+++ b/app/workers/pages_domain_removal_cron_worker.rb
@@ -5,6 +5,7 @@ class PagesDomainRemovalCronWorker
  include CronjobQueue

  feature_category :pages
+  worker_resource_boundary :cpu

  def perform
    PagesDomain.for_removal.find_each do |domain|

--- a/app/workers/pipeline_hooks_worker.rb
+++ b/app/workers/pipeline_hooks_worker.rb
@@ -5,6 +5,8 @@ class PipelineHooksWorker
  include PipelineQueue

  queue_namespace :pipeline_hooks
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  # rubocop: disable CodeReuse/ActiveRecord
  def perform(pipeline_id)

--- a/app/workers/pipeline_metrics_worker.rb
+++ b/app/workers/pipeline_metrics_worker.rb
@@ -4,6 +4,8 @@ class PipelineMetricsWorker
  include ApplicationWorker
  include PipelineQueue

+  latency_sensitive_worker!
+
  # rubocop: disable CodeReuse/ActiveRecord
  def perform(pipeline_id)
    Ci::Pipeline.find_by(id: pipeline_id).try do |pipeline|

--- a/app/workers/pipeline_notification_worker.rb
+++ b/app/workers/pipeline_notification_worker.rb
@@ -4,6 +4,9 @@ class PipelineNotificationWorker
  include ApplicationWorker
  include PipelineQueue

+  latency_sensitive_worker!
+  worker_resource_boundary :cpu
+
  # rubocop: disable CodeReuse/ActiveRecord
  def perform(pipeline_id, recipients = nil)
    pipeline = Ci::Pipeline.find_by(id: pipeline_id)

--- a/app/workers/pipeline_process_worker.rb
+++ b/app/workers/pipeline_process_worker.rb
@@ -6,6 +6,7 @@ class PipelineProcessWorker

  queue_namespace :pipeline_processing
  feature_category :continuous_integration
+  latency_sensitive_worker!

  # rubocop: disable CodeReuse/ActiveRecord
  def perform(pipeline_id, build_ids = nil)

--- a/app/workers/pipeline_schedule_worker.rb
+++ b/app/workers/pipeline_schedule_worker.rb
@@ -5,6 +5,7 @@ class PipelineScheduleWorker
  include CronjobQueue

  feature_category :continuous_integration
+  worker_resource_boundary :cpu

  def perform
    Ci::PipelineSchedule.runnable_schedules.preloaded.find_in_batches do |schedules|

--- a/app/workers/pipeline_success_worker.rb
+++ b/app/workers/pipeline_success_worker.rb
@@ -5,6 +5,7 @@ class PipelineSuccessWorker
  include PipelineQueue

  queue_namespace :pipeline_processing
+  latency_sensitive_worker!

  def perform(pipeline_id)
    # no-op

--- a/app/workers/pipeline_update_worker.rb
+++ b/app/workers/pipeline_update_worker.rb
@@ -5,6 +5,7 @@ class PipelineUpdateWorker
  include PipelineQueue

  queue_namespace :pipeline_processing
+  latency_sensitive_worker!

  # rubocop: disable CodeReuse/ActiveRecord
  def perform(pipeline_id)

--- a/app/workers/post_receive.rb
+++ b/app/workers/post_receive.rb
@@ -4,6 +4,8 @@ class PostReceive
  include ApplicationWorker

  feature_category :source_code_management
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  def perform(gl_repository, identifier, changes, push_options = {})
    project, repo_type = Gitlab::GlRepository.parse(gl_repository)

--- a/app/workers/process_commit_worker.rb
+++ b/app/workers/process_commit_worker.rb
@@ -11,6 +11,7 @@ class ProcessCommitWorker
  include ApplicationWorker

  feature_category :source_code_management
+  latency_sensitive_worker!

  # project_id - The ID of the project this commit belongs to.
  # user_id - The ID of the user that pushed the commit.

--- a/app/workers/project_cache_worker.rb
+++ b/app/workers/project_cache_worker.rb
@@ -3,6 +3,9 @@
 # Worker for updating any project specific caches.
 class ProjectCacheWorker
  include ApplicationWorker
+
+  latency_sensitive_worker!
+
  LEASE_TIMEOUT = 15.minutes.to_i

  feature_category :source_code_management

--- a/app/workers/project_export_worker.rb
+++ b/app/workers/project_export_worker.rb
@@ -6,6 +6,7 @@ class ProjectExportWorker

  sidekiq_options retry: 3
  feature_category :source_code_management
+  worker_resource_boundary :memory

  def perform(current_user_id, project_id, after_export_strategy = {}, params = {})
    current_user = User.find(current_user_id)

--- a/app/workers/project_service_worker.rb
+++ b/app/workers/project_service_worker.rb
@@ -5,6 +5,7 @@ class ProjectServiceWorker

  sidekiq_options dead: false
  feature_category :integrations
+  worker_has_external_dependencies!

  def perform(hook_id, data)
    data = data.with_indifferent_access

--- a/app/workers/reactive_caching_worker.rb
+++ b/app/workers/reactive_caching_worker.rb
@@ -5,6 +5,14 @@ class ReactiveCachingWorker

  feature_category_not_owned!

+  # TODO: The reactive caching worker should be split into
+  # two different workers, one for latency_sensitive jobs without external dependencies
+  # and another worker without latency_sensitivity, but with external dependencies
+  # https://gitlab.com/gitlab-com/gl-infra/scalability/issues/34
+  # This worker should also have `worker_has_external_dependencies!` enabled
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu
+
  def perform(class_name, id, *args)
    klass = begin
      class_name.constantize

--- a/app/workers/remove_expired_members_worker.rb
+++ b/app/workers/remove_expired_members_worker.rb
@@ -5,6 +5,7 @@ class RemoveExpiredMembersWorker
  include CronjobQueue

  feature_category :authentication_and_authorization
+  worker_resource_boundary :cpu

  def perform
    Member.expired.find_each do |member|

--- a/app/workers/repository_import_worker.rb
+++ b/app/workers/repository_import_worker.rb
@@ -7,6 +7,7 @@ class RepositoryImportWorker
  include ProjectImportOptions

  feature_category :importers
+  worker_has_external_dependencies!

  # technical debt: https://gitlab.com/gitlab-org/gitlab/issues/33991
  sidekiq_options memory_killer_memory_growth_kb: ENV.fetch('MEMORY_KILLER_REPOSITORY_IMPORT_WORKER_MEMORY_GROWTH_KB', 50).to_i

--- a/app/workers/repository_update_remote_mirror_worker.rb
+++ b/app/workers/repository_update_remote_mirror_worker.rb
@@ -6,6 +6,8 @@ class RepositoryUpdateRemoteMirrorWorker
  include ApplicationWorker
  include Gitlab::ExclusiveLeaseHelpers

+  worker_has_external_dependencies!
+
  sidekiq_options retry: 3, dead: false
  feature_category :source_code_management


--- a/app/workers/stage_update_worker.rb
+++ b/app/workers/stage_update_worker.rb
@@ -5,6 +5,7 @@ class StageUpdateWorker
  include PipelineQueue

  queue_namespace :pipeline_processing
+  latency_sensitive_worker!

  # rubocop: disable CodeReuse/ActiveRecord
  def perform(stage_id)

--- a/app/workers/stuck_ci_jobs_worker.rb
+++ b/app/workers/stuck_ci_jobs_worker.rb
@@ -5,6 +5,7 @@ class StuckCiJobsWorker
  include CronjobQueue

  feature_category :continuous_integration
+  worker_resource_boundary :cpu

  EXCLUSIVE_LEASE_KEY = 'stuck_ci_builds_worker_lease'


--- a/app/workers/stuck_import_jobs_worker.rb
+++ b/app/workers/stuck_import_jobs_worker.rb
@@ -5,6 +5,7 @@ class StuckImportJobsWorker
  include CronjobQueue

  feature_category :importers
+  worker_resource_boundary :cpu

  IMPORT_JOBS_EXPIRATION = 15.hours.to_i


--- a/app/workers/update_head_pipeline_for_merge_request_worker.rb
+++ b/app/workers/update_head_pipeline_for_merge_request_worker.rb
@@ -6,6 +6,8 @@ class UpdateHeadPipelineForMergeRequestWorker

  queue_namespace :pipeline_processing
  feature_category :continuous_integration
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  def perform(merge_request_id)
    MergeRequest.find_by_id(merge_request_id).try do |merge_request|

--- a/app/workers/update_merge_requests_worker.rb
+++ b/app/workers/update_merge_requests_worker.rb
@@ -4,6 +4,8 @@ class UpdateMergeRequestsWorker
  include ApplicationWorker

  feature_category :source_code_management
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  LOG_TIME_THRESHOLD = 90 # seconds


--- a/app/workers/wait_for_cluster_creation_worker.rb
+++ b/app/workers/wait_for_cluster_creation_worker.rb
@@ -4,6 +4,8 @@ class WaitForClusterCreationWorker
  include ApplicationWorker
  include ClusterQueue

+  worker_has_external_dependencies!
+
  def perform(cluster_id)
    Clusters::Cluster.find_by_id(cluster_id).try do |cluster|
      cluster.provider.try do |provider|

--- a/app/workers/web_hook_worker.rb
+++ b/app/workers/web_hook_worker.rb
@@ -4,6 +4,8 @@ class WebHookWorker
  include ApplicationWorker

  feature_category :integrations
+  worker_has_external_dependencies!
+
  sidekiq_options retry: 4, dead: false

  def perform(hook_id, data, hook_name)

--- a/changelogs/unreleased/an-mark-jobs-as-latency-sensitive.yml
+++ b/changelogs/unreleased/an-mark-jobs-as-latency-sensitive.yml
+---
+title: Attribute Sidekiq workers according to their workloads
+merge_request: 18066
+author:
+type: other
--- a/doc/development/code_review.md
+++ b/doc/development/code_review.md
@@ -368,7 +368,7 @@ Enterprise Edition instance. This has some implications:
      - [Background migrations](background_migrations.md) run in Sidekiq, and
        should only be done for migrations that would take an extreme amount of
        time at GitLab.com scale.
-1. **Sidekiq workers** [cannot change in a backwards-incompatible way](sidekiq_style_guide.md#removing-or-renaming-queues):
+1. **Sidekiq workers** [cannot change in a backwards-incompatible way](sidekiq_style_guide.md#sidekiq-compatibility-across-updates):
   1. Sidekiq queues are not drained before a deploy happens, so there will be
      workers in the queue from the previous version of GitLab.
   1. If you need to change a method signature, try to do so across two releases,

--- a/doc/development/sidekiq_style_guide.md
+++ b/doc/development/sidekiq_style_guide.md
@@ -61,6 +61,168 @@ the extra jobs will take resources away from jobs from workers that were already
 there, if the resources available to the Sidekiq process handling the namespace
 are not adjusted appropriately.

+## Latency Sensitive Jobs
+
+If a large number of background jobs get scheduled at once, queueing of jobs may
+occur while jobs wait for a worker node to become available. This is normal
+and gives the system resilience by allowing it to gracefully handle spikes in
+traffic. Some jobs, however, are more sensitive to latency than others. Examples
+of these jobs include:
+
+1. A job which updates a merge request following a push to a branch.
+1. A job which invalidates a cache of known branches for a project after a push
+   to the branch.
+1. A job which recalculates the groups and projects a user can see after a
+   change in permissions.
+1. A job which updates the status of a CI pipeline after a state change to a job
+   in the pipeline.
+
+When these jobs are delayed, the user may perceive the delay as a bug: for
+example, they may push a branch and then attempt to create a merge request for
+that branch, but be told in the UI that the branch does not exist. We deem these
+jobs to be `latency_sensitive`.
+
+Extra effort is made to ensure that these jobs are started within a very short
+period of time after being scheduled. However, in order to ensure throughput,
+these jobs also have very strict execution duration requirements:
+
+1. The median job execution time should be less than 1 second.
+1. 99% of jobs should complete within 10 seconds.
+
+If a worker cannot meet these expectations, then it cannot be treated as a
+`latency_sensitive` worker: consider redesigning the worker, or splitting the
+work between two different workers, one with `latency_sensitive` code that
+executes quickly, and the other with non-`latency_sensitive`, which has no
+execution latency requirements (but also has lower scheduling targets).
+
+This can be summed up in the following table:
+
+| **Latency Sensitivity** | **Queue Scheduling Target** | **Execution Latency Requirement**   |
+|-------------------------|-----------------------------|-------------------------------------|
+| Not `latency_sensitive` | 1 minute                    | Maximum run time of 1 hour          |
+| `latency_sensitive`     | 100 milliseconds            | p50 of 1 second, p99 of 10 seconds  |
+
+To mark a worker as being `latency_sensitive`, use the
+`latency_sensitive_worker!` attribute, as shown in this example:
+
+```ruby
+class LatencySensitiveWorker
+  include ApplicationWorker
+
+  latency_sensitive_worker!
+
+  # ...
+end
+```
+
+## Jobs with External Dependencies
+
+Most background jobs in the GitLab application communicate with other GitLab
+services, e.g. Postgres, Redis, Gitaly and Object Storage. These are considered
+to be "internal" dependencies for a job.
+
+However, some jobs will be dependent on external services in order to complete
+successfully. Some examples include:
+
+1. Jobs which call web-hooks configured by a user.
+1. Jobs which deploy an application to a k8s cluster configured by a user.
+
+These jobs have "external dependencies". This is important for the operation of
+the background processing cluster in several ways:
+
+1. Most external dependencies (such as web-hooks) do not provide SLOs, and
+   therefore we cannot guarantee the execution latencies on these jobs. Since we
+   cannot guarantee execution latency, we cannot ensure throughput and
+   therefore, in high-traffic environments, we need to ensure that jobs with
+   external dependencies are separated from `latency_sensitive` jobs, to ensure
+   throughput on those queues.
+1. Errors in jobs with external dependencies have higher alerting thresholds as
+   there is a likelihood that the cause of the error is external.
+
+```ruby
+class ExternalDependencyWorker
+  include ApplicationWorker
+
+  # Declares that this worker depends on
+  # third-party, external services in order
+  # to complete successfully
+  worker_has_external_dependencies!
+
+  # ...
+end
+```
+
+NOTE: **Note:** A job cannot both be latency sensitive and have
+external dependencies.
+
+## CPU-bound and Memory-bound Workers
+
+Workers that are constrained by CPU or memory resource limitations should be
+annotated with the `worker_resource_boundary` method.
+
+Most workers tend to spend most of their time blocked, waiting on network responses
+from other services such as Redis, Postgres and Gitaly. Since Sidekiq is a
+multithreaded environment, these jobs can be scheduled with high concurrency.
+
+Some workers, however, spend large amounts of time _on-cpu_ running logic in
+Ruby. Ruby MRI does not support true multithreading - it relies on the
+[GIL](https://thoughtbot.com/blog/untangling-ruby-threads#the-global-interpreter-lock)
+to greatly simplify application development by only allowing one section of Ruby
+code in a process to run at a time, no matter how many cores the machine
+hosting the process has. For IO bound workers, this is not a problem, since most
+of the threads are blocked in underlying libraries (which are outside of the
+GIL).
+
+If many threads are attempting to run Ruby code simultaneously, this will lead
+to contention on the GIL which will have the effect of slowing down all
+processes.
+
+In high-traffic environments, knowing that a worker is CPU-bound allows us to
+run it on a different fleet with lower concurrency. This ensures optimal
+performance.
+
+Likewise, if a worker uses large amounts of memory, we can run these on a
+bespoke low concurrency, high memory fleet.
+
+Note that memory-bound workers create heavy GC workloads, with pauses of
+10-50ms. This will have an impact on the latency requirements for the
+worker. For this reason, `memory` bound, `latency_sensitive` jobs are not
+permitted and will fail CI. In general, `memory` bound workers are
+discouraged, and alternative approaches to processing the work should be
+considered.
+
+## Declaring a Job as CPU-bound
+
+This example shows how to declare a job as being CPU-bound.
+
+```ruby
+class CPUIntensiveWorker
+  include ApplicationWorker
+
+  # Declares that this worker will perform a lot of
+  # calculations on-CPU.
+  worker_resource_boundary :cpu
+
+  # ...
+end
+```
+
+## Determining whether a worker is CPU-bound
+
+We use the following approach to determine whether a worker is CPU-bound:
+
+- In the Sidekiq structured JSON logs, aggregate the worker `duration` and
+  `cpu_s` fields.
+- `duration` refers to the total job execution duration, in seconds
+- `cpu_s` is derived from the
+  [`Process::CLOCK_THREAD_CPUTIME_ID`](https://www.rubydoc.info/stdlib/core/Process:clock_gettime)
+  counter, and is a measure of time spent by the job on-CPU.
+- Divide `cpu_s` by `duration` to get the percentage of time spent on-CPU.
+- If this ratio exceeds 33%, the worker is considered CPU-bound and should be
+  annotated as such.
+- Note that these values should not be used over small sample sizes, but
+  rather over fairly large aggregates.
+
 ## Feature Categorization

 Each Sidekiq worker, or one of its ancestor classes, must declare a
@@ -74,7 +236,7 @@ The declaration uses the `feature_category` class method, as shown below.
 class SomeScheduledTaskWorker
  include ApplicationWorker

-  # Declares that this feature is part of the
+  # Declares that this worker is part of the
  # `continuous_integration` feature category
  feature_category :continuous_integration

@@ -88,11 +250,11 @@ source](https://gitlab.com/gitlab-com/www-gitlab-com/blob/master/data/stages.yml

 ### Updating `config/feature_categories.yml`

-Occassionally new features will be added to GitLab stages. When this occurs, you
+Occasionally new features will be added to GitLab stages. When this occurs, you
 can automatically update `config/feature_categories.yml` by running
 `scripts/update-feature-categories`. This script will fetch and parse
 [`stages.yml`](https://gitlab.com/gitlab-com/www-gitlab-com/blob/master/data/stages.yml)
-and generare a new version of the file, which needs to be checked into source control.
+and generate a new version of the file, which needs to be checked into source control.

 ### Excluding Sidekiq workers from feature categorization

@@ -116,9 +278,63 @@ end
 Each Sidekiq worker must be tested using RSpec, just like any other class. These
 tests should be placed in `spec/workers`.

-## Removing or renaming queues
+## Sidekiq Compatibility across Updates
+
+Keep in mind that the arguments for a Sidekiq job are stored in a queue while it
+is scheduled for execution. During an online update, this could lead to several
+possible situations:
+
+1. An older version of the application publishes a job, which is executed by an
+   upgraded Sidekiq node.
+1. A job is queued before an upgrade, but executed after an upgrade.
+1. A job is queued by a node running the newer version of the application, but
+   executed on a node running an older version of the application.
+
+### Changing the arguments for a worker
+
+Jobs need to be backwards- and forwards-compatible between consecutive versions
+of the application.
+
+This can be done by following this process:
+
+1. **Do not remove arguments from the `perform` function.** Instead, use the
+   following approach:
+   1. Provide a default value (usually `nil`) and use a comment to mark the
+      argument as deprecated
+   1. Stop using the argument in `perform_async`.
+   1. Ignore the value in the worker class, but do not remove it until the next
+      major release.
+
+### Removing workers
+
+Try to avoid removing workers and their queues in minor and patch
+releases.

-Try to avoid renaming or removing workers and their queues in minor and patch releases.
 During online update instance can have pending jobs and removing the queue can
 lead to those jobs being stuck forever. If you can't write migration for those
-Sidekiq jobs, please consider doing rename or remove queue in major release only.
+Sidekiq jobs, please consider removing the worker in a major release only.
+
+### Renaming queues
+
+For the same reasons that removing workers is dangerous, care should be taken
+when renaming queues.
+
+When renaming queues, use the `sidekiq_queue_migrate` helper migration method,
+as shown in this example:
+
+```ruby
+class MigrateTheRenamedSidekiqQueue < ActiveRecord::Migration[5.0]
+  include Gitlab::Database::MigrationHelpers
+
+  DOWNTIME = false
+
+  def up
+    sidekiq_queue_migrate 'old_queue_name', to: 'new_queue_name'
+  end
+
+  def down
+    sidekiq_queue_migrate 'new_queue_name', to: 'old_queue_name'
+  end
+end
+
+```
--- a/ee/app/workers/ci/create_cross_project_pipeline_worker.rb
+++ b/ee/app/workers/ci/create_cross_project_pipeline_worker.rb
@@ -5,6 +5,8 @@ module Ci
    include ::ApplicationWorker
    include ::PipelineQueue

+    worker_resource_boundary :cpu
+
    def perform(bridge_id)
      ::Ci::Bridge.find_by_id(bridge_id).try do |bridge|
        ::Ci::CreateCrossProjectPipelineService

--- a/ee/app/workers/ci/pipeline_bridge_status_worker.rb
+++ b/ee/app/workers/ci/pipeline_bridge_status_worker.rb
@@ -5,6 +5,9 @@ module Ci
    include ::ApplicationWorker
    include ::PipelineQueue

+    latency_sensitive_worker!
+    worker_resource_boundary :cpu
+
    def perform(pipeline_id)
      ::Ci::Pipeline.find_by_id(pipeline_id).try do |pipeline|
        ::Ci::PipelineBridgeStatusService

--- a/ee/app/workers/create_github_webhook_worker.rb
+++ b/ee/app/workers/create_github_webhook_worker.rb
@@ -5,6 +5,8 @@ class CreateGithubWebhookWorker
  include GrapePathHelpers::NamedRouteMatcher

  feature_category :integrations
+  worker_resource_boundary :cpu
+  worker_has_external_dependencies!

  attr_reader :project


--- a/ee/app/workers/export_csv_worker.rb
+++ b/ee/app/workers/export_csv_worker.rb
@@ -4,6 +4,7 @@ class ExportCsvWorker
  include ApplicationWorker

  feature_category :issue_tracking
+  worker_resource_boundary :memory

  def perform(current_user_id, project_id, params)
    @current_user = User.find(current_user_id)

--- a/ee/app/workers/incident_management/process_prometheus_alert_worker.rb
+++ b/ee/app/workers/incident_management/process_prometheus_alert_worker.rb
@@ -6,6 +6,7 @@ module IncidentManagement

    queue_namespace :incident_management
    feature_category :incident_management
+    worker_resource_boundary :cpu

    def perform(project_id, alert_hash)
      project = find_project(project_id)

--- a/ee/app/workers/ldap_all_groups_sync_worker.rb
+++ b/ee/app/workers/ldap_all_groups_sync_worker.rb
@@ -5,6 +5,7 @@ class LdapAllGroupsSyncWorker
  include CronjobQueue

  feature_category :authentication_and_authorization
+  worker_has_external_dependencies!

  def perform
    return unless Gitlab::Auth::LDAP::Config.group_sync_enabled?

--- a/ee/app/workers/ldap_group_sync_worker.rb
+++ b/ee/app/workers/ldap_group_sync_worker.rb
@@ -4,6 +4,7 @@ class LdapGroupSyncWorker
  include ApplicationWorker

  feature_category :authentication_and_authorization
+  worker_has_external_dependencies!

  # rubocop: disable CodeReuse/ActiveRecord
  def perform(group_ids, provider = nil)

--- a/ee/app/workers/ldap_sync_worker.rb
+++ b/ee/app/workers/ldap_sync_worker.rb
@@ -5,6 +5,7 @@ class LdapSyncWorker
  include CronjobQueue

  feature_category :authentication_and_authorization
+  worker_has_external_dependencies!

  # rubocop: disable CodeReuse/ActiveRecord
  # rubocop: disable Gitlab/RailsLogger

--- a/ee/app/workers/new_epic_worker.rb
+++ b/ee/app/workers/new_epic_worker.rb
@@ -5,6 +5,7 @@ class NewEpicWorker
  include NewIssuable

  feature_category :agile_portfolio_management
+  worker_resource_boundary :cpu

  def perform(epic_id, user_id)
    return unless objects_found?(epic_id, user_id)

--- a/ee/app/workers/sync_security_reports_to_report_approval_rules_worker.rb
+++ b/ee/app/workers/sync_security_reports_to_report_approval_rules_worker.rb
@@ -7,6 +7,8 @@ class SyncSecurityReportsToReportApprovalRulesWorker
  include PipelineQueue

  feature_category :static_application_security_testing
+  latency_sensitive_worker!
+  worker_resource_boundary :cpu

  def perform(pipeline_id)
    pipeline = Ci::Pipeline.find_by_id(pipeline_id)

--- a/ee/app/workers/update_max_seats_used_for_gitlab_com_subscriptions_worker.rb
+++ b/ee/app/workers/update_max_seats_used_for_gitlab_com_subscriptions_worker.rb
@@ -5,6 +5,7 @@ class UpdateMaxSeatsUsedForGitlabComSubscriptionsWorker
  include CronjobQueue

  feature_category :license_compliance
+  worker_resource_boundary :cpu

  # rubocop: disable CodeReuse/ActiveRecord
  def perform

--- a/spec/workers/every_sidekiq_worker_spec.rb
+++ b/spec/workers/every_sidekiq_worker_spec.rb
@@ -21,8 +21,8 @@ describe 'Every Sidekiq worker' do
    missing_from_file = worker_queues - file_worker_queues
    expect(missing_from_file).to be_empty, "expected #{missing_from_file.to_a.inspect} to be in Gitlab::SidekiqConfig::QUEUE_CONFIG_PATHS"

-    unncessarily_in_file = file_worker_queues - worker_queues
-    expect(unncessarily_in_file).to be_empty, "expected #{unncessarily_in_file.to_a.inspect} not to be in Gitlab::SidekiqConfig::QUEUE_CONFIG_PATHS"
+    unnecessarily_in_file = file_worker_queues - worker_queues
+    expect(unnecessarily_in_file).to be_empty, "expected #{unnecessarily_in_file.to_a.inspect} not to be in Gitlab::SidekiqConfig::QUEUE_CONFIG_PATHS"
  end

  it 'has its queue or namespace in config/sidekiq_queues.yml', :aggregate_failures do
@@ -42,7 +42,7 @@ describe 'Every Sidekiq worker' do
    end

    # All Sidekiq worker classes should declare a valid `feature_category`
-    # or explicitely be excluded with the `feature_category_not_owned!` annotation.
+    # or explicitly be excluded with the `feature_category_not_owned!` annotation.
    # Please see doc/development/sidekiq_style_guide.md#Feature-Categorization for more details.
    it 'has a feature_category or feature_category_not_owned! attribute', :aggregate_failures do
      Gitlab::SidekiqConfig.workers.each do |worker|
@@ -62,5 +62,36 @@ describe 'Every Sidekiq worker' do
        expect(feature_categories).to include(worker.get_feature_category), "expected #{worker.inspect} to declare a valid feature_category, but got #{worker.get_feature_category}"
      end
    end
+
+    # Memory-bound workers are very expensive to run, since they need to run on nodes with very low
+    # concurrency, so that each job can consume large amounts of memory. For this reason, on
+    # GitLab.com, when a large number of memory-bound jobs arrive at once, we let them queue up
+    # rather than scaling the hardware to meet the SLO. For this reason, memory-bound,
+    # latency-sensitive jobs are explicitly discouraged and disabled.
+    it 'is (exclusively) memory-bound or latency-sentitive, not both', :aggregate_failures do
+      latency_sensitive_workers = Gitlab::SidekiqConfig.workers
+                  .select(&:latency_sensitive_worker?)
+
+      latency_sensitive_workers.each do |worker|
+        expect(worker.get_worker_resource_boundary).not_to eq(:memory), "#{worker.inspect} cannot be both memory-bound and latency sensitive"
+      end
+    end
+
+    # In high traffic installations, such as GitLab.com, `latency_sensitive` workers run in a
+    # dedicated fleet. In order to ensure short queue times, `latency_sensitive` jobs have strict
+    # SLOs in order to ensure throughput. However, when a worker depends on an external service,
+    # such as a user's k8s cluster or a third-party internet service, we cannot guarantee latency,
+    # and therefore throughput. An outage of a 3rd party service could therefore impact throughput
+    # on other latency_sensitive jobs, leading to degradation through the GitLab application.
+    # Please see doc/development/sidekiq_style_guide.md#Jobs-with-External-Dependencies for more
+    # details.
+    it 'has (exclusively) external dependencies or is latency-sentitive, not both', :aggregate_failures do
+      latency_sensitive_workers = Gitlab::SidekiqConfig.workers
+                  .select(&:latency_sensitive_worker?)
+
+      latency_sensitive_workers.each do |worker|
+        expect(worker.worker_has_external_dependencies?).to be_falsey, "#{worker.inspect} cannot have both external dependencies and be latency sensitive"
+      end
+    end
  end
 end