Commit 1987ad53 authored by Stan Hu's avatar Stan Hu Committed by Nick Thomas

Add support for logging Prometheus metrics for Geo

parent 715ea16f
...@@ -58,8 +58,8 @@ class GeoNodeStatus { ...@@ -58,8 +58,8 @@ class GeoNodeStatus {
this.setHealthStatus(status.healthy); this.setHealthStatus(status.healthy);
// Replication lag can be nil if the secondary isn't actually streaming // Replication lag can be nil if the secondary isn't actually streaming
if (status.db_replication_lag) { if (status.db_replication_lag_seconds !== null && status.db_replication_lag_seconds >= 0) {
const parsedTime = parseSeconds(status.db_replication_lag, { const parsedTime = parseSeconds(status.db_replication_lag_seconds, {
hoursPerDay: 24, hoursPerDay: 24,
daysPerWeek: 7, daysPerWeek: 7,
}); });
...@@ -96,8 +96,17 @@ class GeoNodeStatus { ...@@ -96,8 +96,17 @@ class GeoNodeStatus {
this.$attachmentsSynced.text(attachmentText); this.$attachmentsSynced.text(attachmentText);
this.$attachmentsFailed.text(attachmentFailedText); this.$attachmentsFailed.text(attachmentFailedText);
const eventDate = gl.utils.formatDate(new Date(status.last_event_date)); let eventDate = 'N/A';
const cursorDate = gl.utils.formatDate(new Date(status.cursor_last_event_date)); let cursorDate = 'N/A';
if (status.last_event_timestamp !== null) {
eventDate = gl.utils.formatDate(new Date(status.last_event_timestamp * 1000));
}
if (status.cursor_last_event_timestamp !== null) {
cursorDate = gl.utils.formatDate(new Date(status.cursor_last_event_timestamp * 1000));
}
this.$lastEventSeen.text(`${status.last_event_id} (${eventDate})`); this.$lastEventSeen.text(`${status.last_event_id} (${eventDate})`);
this.$lastCursorEvent.text(`${status.cursor_last_event_id} (${cursorDate})`); this.$lastCursorEvent.text(`${status.cursor_last_event_id} (${cursorDate})`);
if (status.health === 'Healthy') { if (status.health === 'Healthy') {
......
class GeoNodeStatus class GeoNodeStatus
include ActiveModel::Model include ActiveModel::Model
attr_accessor :id attr_accessor :id, :success
attr_writer :health attr_writer :health
def health def health
...@@ -14,14 +14,14 @@ class GeoNodeStatus ...@@ -14,14 +14,14 @@ class GeoNodeStatus
health.blank? health.blank?
end end
def db_replication_lag def db_replication_lag_seconds
return @db_replication_lag if defined?(@db_replication_lag) return @db_replication_lag_seconds if defined?(@db_replication_lag_seconds)
@db_replication_lag = Gitlab::Geo::HealthCheck.db_replication_lag if Gitlab::Geo.secondary? @db_replication_lag_seconds = Gitlab::Geo::HealthCheck.db_replication_lag_seconds if Gitlab::Geo.secondary?
end end
def db_replication_lag=(value) def db_replication_lag_seconds=(value)
@db_replication_lag = value @db_replication_lag_seconds = value
end end
def last_event_id def last_event_id
...@@ -32,12 +32,12 @@ class GeoNodeStatus ...@@ -32,12 +32,12 @@ class GeoNodeStatus
@last_event_id = value @last_event_id = value
end end
def last_event_date def last_event_timestamp
@last_event_date ||= Geo::EventLog.latest_event&.created_at @last_event_timestamp ||= Geo::EventLog.latest_event&.created_at&.to_i
end end
def last_event_date=(value) def last_event_timestamp=(value)
@last_event_date = value @last_event_timestamp = value
end end
def cursor_last_event_id def cursor_last_event_id
...@@ -50,16 +50,16 @@ class GeoNodeStatus ...@@ -50,16 +50,16 @@ class GeoNodeStatus
@cursor_last_event_id = value @cursor_last_event_id = value
end end
def cursor_last_event_date def cursor_last_event_timestamp
event_id = cursor_last_event_id event_id = cursor_last_event_id
return unless event_id return unless event_id
@cursor_last_event_date ||= Geo::EventLog.find_by(id: event_id)&.created_at @cursor_last_event_timestamp ||= Geo::EventLog.find_by(id: event_id)&.created_at&.to_i
end end
def cursor_last_event_date=(value) def cursor_last_event_timestamp=(value)
@cursor_last_event_date = value @cursor_last_event_timestamp = value
end end
def repositories_count def repositories_count
...@@ -159,6 +159,10 @@ class GeoNodeStatus ...@@ -159,6 +159,10 @@ class GeoNodeStatus
sync_percentage(attachments_count, attachments_synced_count) sync_percentage(attachments_count, attachments_synced_count)
end end
def [](key)
public_send(key) # rubocop:disable GitlabSecurity/PublicSend
end
private private
def sync_percentage(total, synced) def sync_percentage(total, synced)
......
...@@ -15,7 +15,7 @@ class GeoNodeStatusEntity < Grape::Entity ...@@ -15,7 +15,7 @@ class GeoNodeStatusEntity < Grape::Entity
number_to_percentage(node.attachments_synced_in_percentage, precision: 2) number_to_percentage(node.attachments_synced_in_percentage, precision: 2)
end end
expose :db_replication_lag expose :db_replication_lag_seconds
expose :lfs_objects_count expose :lfs_objects_count
expose :lfs_objects_synced_count expose :lfs_objects_synced_count
...@@ -32,7 +32,7 @@ class GeoNodeStatusEntity < Grape::Entity ...@@ -32,7 +32,7 @@ class GeoNodeStatusEntity < Grape::Entity
end end
expose :last_event_id expose :last_event_id
expose :last_event_date expose :last_event_timestamp
expose :cursor_last_event_id expose :cursor_last_event_id
expose :cursor_last_event_date expose :cursor_last_event_timestamp
end end
module Geo
  # Publishes Geo node status values as Prometheus metrics.
  #
  # When Geo is enabled, a primary node collects the status of every node
  # returned by Gitlab::Geo.secondary_nodes; any other node collects only the
  # status of Gitlab::Geo.current_node. Each Integer value listed in
  # NodeStatusService::STATUS_DATA is exported as a gauge labelled with the
  # node URL. A failed status retrieval increments a per-node failure counter.
  class MetricsUpdateService
    # Prefix prepended to every gauge name derived from a status key.
    METRIC_PREFIX = 'geo_'.freeze

    # Entry point. Does nothing (returns nil) when Geo is not enabled.
    def execute
      return unless Gitlab::Geo.enabled?

      if Gitlab::Geo.primary?
        fetch_secondary_geo_nodes_metrics
      else
        fetch_current_geo_node_metrics
      end
    end

    private

    # Collects metrics for each secondary node known to this instance.
    def fetch_secondary_geo_nodes_metrics
      Gitlab::Geo.secondary_nodes.find_each { |node| fetch_geo_node_metrics(node) }
    end

    # Collects metrics for the node this instance is running as.
    def fetch_current_geo_node_metrics
      fetch_geo_node_metrics(Gitlab::Geo.current_node)
    end

    # Retrieves the status of +node+ and records its Integer values as gauges.
    # When the status could not be retrieved, only the failure counter is
    # incremented and no gauge (including the last-updated one) is touched.
    def fetch_geo_node_metrics(node)
      status = node_status(node)

      unless status.success
        increment_failed_status_counter(node)
        return
      end

      NodeStatusService::STATUS_DATA.each do |key, docstring|
        value = status[key]

        # Non-Integer values (e.g. the health text, or nil) cannot be gauges.
        next unless value.is_a?(Integer)

        gauge = Gitlab::Metrics.gauge(gauge_metric_name(key), docstring, {}, :max)
        gauge.set(metric_labels(node), value)
      end

      set_last_updated_at(node)
    end

    # Returns the status object produced by NodeStatusService for +node+.
    def node_status(node)
      NodeStatusService.new.call(node)
    end

    # Records the current UNIX time as the moment +node+ was last updated.
    def set_last_updated_at(node)
      gauge = Gitlab::Metrics.gauge(
        :geo_status_last_updated_timestamp,
        'UNIX timestamp of last time Geo node status was updated internally',
        {},
        :max)
      gauge.set(metric_labels(node), Time.now.to_i)
    end

    # Increments the failure counter for +node+.
    #
    # The node labels are supplied at increment time, exactly like the gauges
    # above do with #set. Baking them into the counter as base labels would
    # attach the labels of whichever node was seen first to every later
    # increment if the registry hands back the already-registered counter.
    def increment_failed_status_counter(node)
      failed_status_counter.increment(metric_labels(node))
    end

    # Returns the failure counter. The optional argument is accepted only for
    # backward compatibility with callers that used to pass the node; it is
    # ignored because labels are now applied when incrementing.
    def failed_status_counter(_node = nil)
      Gitlab::Metrics.counter(
        :geo_status_failed_total,
        'Total number of times status for Geo node failed to retrieve',
        {})
    end

    # Builds the gauge name for a status key.
    #
    # Prometheus naming conventions in
    # https://prometheus.io/docs/instrumenting/writing_exporters/#naming say
    # that _count and _total should be reserved for counters, so a trailing
    # _count/_total suffix is removed before the prefix is added.
    def gauge_metric_name(name)
      base_name = name.to_s.sub(/(_count|_total)\z/, '')

      (METRIC_PREFIX + base_name).to_sym
    end

    # Labels attached to every metric sample for +node+.
    def metric_labels(node)
      { url: node.url }
    end
  end
end
...@@ -3,31 +3,33 @@ module Geo ...@@ -3,31 +3,33 @@ module Geo
include Gitlab::CurrentSettings include Gitlab::CurrentSettings
include HTTParty include HTTParty
KEYS = %w( STATUS_DATA = {
health health: 'Summary of health status',
db_replication_lag db_replication_lag_seconds: 'Database replication lag (seconds)',
repositories_count repositories_count: 'Total number of repositories available on primary',
repositories_synced_count repositories_synced_count: 'Number of repositories synced on secondary',
repositories_failed_count repositories_failed_count: 'Number of repositories failed to sync on secondary',
lfs_objects_count lfs_objects_count: 'Total number of LFS objects available on primary',
lfs_objects_synced_count lfs_objects_synced_count: 'Number of LFS objects synced on secondary',
lfs_objects_failed_count lfs_objects_failed_count: 'Number of LFS objects failed to sync on secondary',
attachments_count attachments_count: 'Total number of file attachments available on primary',
attachments_synced_count attachments_synced_count: 'Number of attachments synced on secondary',
attachments_failed_count attachments_failed_count: 'Number of attachments failed to sync on secondary',
last_event_id last_event_id: 'Database ID of the latest event log entry on the primary',
last_event_date last_event_timestamp: 'UNIX timestamp of the latest event log entry on the primary',
cursor_last_event_id cursor_last_event_id: 'Last database ID of the event log processed by the secondary',
cursor_last_event_date cursor_last_event_timestamp: 'Last UNIX timestamp of the event log processed by the secondary'
).freeze }.freeze
def call(geo_node) def call(geo_node)
values = data = { id: geo_node.id }
begin begin
response = self.class.get(geo_node.status_url, headers: headers, timeout: timeout) response = self.class.get(geo_node.status_url, headers: headers, timeout: timeout)
data[:success] = response.success?
if response.success? if response.success?
response.parsed_response.values_at(*KEYS) data.merge!(response.parsed_response.symbolize_keys.slice(*STATUS_DATA.keys))
else else
message = "Could not connect to Geo node - HTTP Status Code: #{response.code} #{response.message}" message = "Could not connect to Geo node - HTTP Status Code: #{response.code} #{response.message}"
payload = response.parsed_response payload = response.parsed_response
...@@ -39,17 +41,21 @@ module Geo ...@@ -39,17 +41,21 @@ module Geo
'' ''
end end
Array([message, details].compact.join("\n")) data[:health] = [message, details].compact.join("\n")
end end
rescue Gitlab::Geo::GeoNodeNotFoundError rescue Gitlab::Geo::GeoNodeNotFoundError
['This GitLab instance does not appear to be configured properly as a Geo node. Make sure the URLs are using the correct fully-qualified domain names.'] data[:health] = 'This GitLab instance does not appear to be configured properly as a Geo node. Make sure the URLs are using the correct fully-qualified domain names.'
rescue OpenSSL::Cipher::CipherError rescue OpenSSL::Cipher::CipherError
['Error decrypting the Geo secret from the database. Check that the primary uses the correct db_key_base.'] data[:health] = 'Error decrypting the Geo secret from the database. Check that the primary uses the correct db_key_base.'
rescue HTTParty::Error, Timeout::Error, SocketError, SystemCallError, OpenSSL::SSL::SSLError => e rescue HTTParty::Error, Timeout::Error, SocketError, SystemCallError, OpenSSL::SSL::SSLError => e
[e.message] data[:health] = e.message
end
GeoNodeStatus.new(data)
end end
GeoNodeStatus.new(KEYS.zip(values).to_h.merge(id: geo_node.id)) def status_keys
STATUS_DATA.stringify_keys.keys
end end
private private
......
# Mixin that runs a block only while holding a Gitlab::ExclusiveLease, and
# cancels the lease once the block has finished.
#
# Including classes must provide #lease_timeout, which is passed as the
# `timeout:` of the lease. They may provide their own #log_error; otherwise
# the fallback defined at the bottom of this module is used.
module ExclusiveLeaseGuard
  extend ActiveSupport::Concern

  # Key identifying the lease; derived from the including class name.
  def lease_key
    @lease_key ||= self.class.name.underscore
  end

  # Tries to obtain the lease and, on success, yields the value returned by
  # `try_obtain` to the block. The lease is cancelled afterwards even if the
  # block raises. When the lease cannot be obtained an error is logged, the
  # block is not run and nil is returned.
  def try_obtain_lease
    lease = exclusive_lease.try_obtain

    unless lease
      log_error('Cannot obtain an exclusive lease. There must be another worker already in execution.')
      return
    end

    begin
      yield lease
    ensure
      release_lease(lease)
    end
  end

  # Memoized lease object built from #lease_key and #lease_timeout.
  def exclusive_lease
    @lease ||= Gitlab::ExclusiveLease.new(lease_key, timeout: lease_timeout)
  end

  # Cancels the lease identified by +uuid+ for this guard's key.
  def release_lease(uuid)
    Gitlab::ExclusiveLease.cancel(lease_key, uuid)
  end

  # Fallback error logger used by #try_obtain_lease.
  #
  # Without it, an includer that does not define #log_error would raise
  # NoMethodError whenever the lease is already taken. An implementation
  # found further up the ancestor chain (e.g. from a module included before
  # this one) is preferred; a class-level definition overrides this method
  # entirely. Otherwise the message goes to #logger when the includer has one.
  def log_error(message, *args)
    return super if defined?(super)

    logger.error(message) if respond_to?(:logger, true)
  end
end
module Geo
  # Sidekiq worker that refreshes the Geo Prometheus metrics.
  #
  # The update only runs when Prometheus metrics are enabled, and it is
  # wrapped in an exclusive lease (see ExclusiveLeaseGuard) whose timeout is
  # LEASE_TIMEOUT.
  class MetricsUpdateWorker
    include Sidekiq::Worker
    include ExclusiveLeaseGuard
    include CronjobQueue

    LEASE_TIMEOUT = 5.minutes

    # Timeout handed to the exclusive lease by ExclusiveLeaseGuard.
    def lease_timeout
      LEASE_TIMEOUT
    end

    # Runs Geo::MetricsUpdateService under the lease; returns nil without
    # doing anything when Prometheus metrics are disabled.
    def perform
      if Gitlab::Metrics.prometheus_metrics_enabled?
        try_obtain_lease do
          Geo::MetricsUpdateService.new.execute
        end
      end
    end
  end
end
---
title: Add support for logging Prometheus metrics for Geo
merge_request: !3187
author:
type: added
...@@ -241,13 +241,18 @@ production: &base ...@@ -241,13 +241,18 @@ production: &base
ldap_sync_worker: ldap_sync_worker:
cron: "30 1 * * *" cron: "30 1 * * *"
# GitLab Geo repository sync worker # GitLab Geo metrics update worker
# NOTE: This will only take effect if Geo is enabled # NOTE: This will only take effect if Geo is enabled
geo_metrics_update_worker:
cron: "*/1 * * * *"
# GitLab Geo repository sync worker
# NOTE: This will only take effect if Geo is enabled (secondary nodes only)
geo_repository_sync_worker: geo_repository_sync_worker:
cron: "*/5 * * * *" cron: "*/5 * * * *"
# GitLab Geo file download dispatch worker # GitLab Geo file download dispatch worker
# NOTE: This will only take effect if Geo is enabled # NOTE: This will only take effect if Geo is enabled (secondary nodes only)
geo_file_download_dispatch_worker: geo_file_download_dispatch_worker:
cron: "*/10 * * * *" cron: "*/10 * * * *"
......
...@@ -443,6 +443,9 @@ Settings.cron_jobs['ldap_sync_worker']['job_class'] = 'LdapSyncWorker' ...@@ -443,6 +443,9 @@ Settings.cron_jobs['ldap_sync_worker']['job_class'] = 'LdapSyncWorker'
Settings.cron_jobs['ldap_group_sync_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['ldap_group_sync_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['ldap_group_sync_worker']['cron'] ||= '0 * * * *' Settings.cron_jobs['ldap_group_sync_worker']['cron'] ||= '0 * * * *'
Settings.cron_jobs['ldap_group_sync_worker']['job_class'] = 'LdapAllGroupsSyncWorker' Settings.cron_jobs['ldap_group_sync_worker']['job_class'] = 'LdapAllGroupsSyncWorker'
Settings.cron_jobs['geo_metrics_update_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['geo_metrics_update_worker']['cron'] ||= '*/1 * * * *'
Settings.cron_jobs['geo_metrics_update_worker']['job_class'] ||= 'Geo::MetricsUpdateWorker'
Settings.cron_jobs['geo_repository_sync_worker'] ||= Settingslogic.new({}) Settings.cron_jobs['geo_repository_sync_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['geo_repository_sync_worker']['cron'] ||= '*/1 * * * *' Settings.cron_jobs['geo_repository_sync_worker']['cron'] ||= '*/1 * * * *'
Settings.cron_jobs['geo_repository_sync_worker']['job_class'] ||= 'Geo::RepositorySyncWorker' Settings.cron_jobs['geo_repository_sync_worker']['job_class'] ||= 'Geo::RepositorySyncWorker'
......
...@@ -22,7 +22,7 @@ collect metrics from this endpoint. We recommend setting up another Prometheus ...@@ -22,7 +22,7 @@ collect metrics from this endpoint. We recommend setting up another Prometheus
server, because the embedded server configuration is overwritten once every server, because the embedded server configuration is overwritten once every
[reconfigure of GitLab][reconfigure]. In the future this will not be required. [reconfigure of GitLab][reconfigure]. In the future this will not be required.
## Metrics available ## Unicorn Metrics available
In this experimental phase, only a few metrics are available: In this experimental phase, only a few metrics are available:
...@@ -48,6 +48,30 @@ In this experimental phase, only a few metrics are available: ...@@ -48,6 +48,30 @@ In this experimental phase, only a few metrics are available:
| filesystem_circuitbreaker_latency_seconds | Histogram | 9.5 | Latency of the stat check the circuitbreaker uses to probe a shard | | filesystem_circuitbreaker_latency_seconds | Histogram | 9.5 | Latency of the stat check the circuitbreaker uses to probe a shard |
| filesystem_circuitbreaker | Gauge | 9.5 | Whether or not the circuit for a certain shard is broken or not | | filesystem_circuitbreaker | Gauge | 9.5 | Whether or not the circuit for a certain shard is broken or not |
## Sidekiq Metrics available
Sidekiq jobs may also gather metrics, and these metrics can be accessed if the Sidekiq exporter is enabled (e.g. via
the `monitoring.sidekiq_exporter` configuration option in `gitlab.yml`).
| Metric | Type | Since | Description | Labels |
|:--------------------------------- |:--------- |:----- |:----------- |:------ |
|geo_db_replication_lag_seconds | Gauge | 10.2 | Database replication lag (seconds) | url
|geo_repositories | Gauge | 10.2 | Total number of repositories available on primary | url
|geo_repositories_synced | Gauge | 10.2 | Number of repositories synced on secondary | url
|geo_repositories_failed | Gauge | 10.2 | Number of repositories failed to sync on secondary | url
|geo_lfs_objects | Gauge | 10.2 | Total number of LFS objects available on primary | url
|geo_lfs_objects_synced | Gauge | 10.2 | Number of LFS objects synced on secondary | url
|geo_lfs_objects_failed | Gauge | 10.2 | Number of LFS objects failed to sync on secondary | url
|geo_attachments | Gauge | 10.2 | Total number of file attachments available on primary | url
|geo_attachments_synced | Gauge | 10.2 | Number of attachments synced on secondary | url
|geo_attachments_failed | Gauge | 10.2 | Number of attachments failed to sync on secondary | url
|geo_last_event_id | Gauge | 10.2 | Database ID of the latest event log entry on the primary | url
|geo_last_event_timestamp | Gauge | 10.2 | UNIX timestamp of the latest event log entry on the primary | url
|geo_cursor_last_event_id | Gauge | 10.2 | Last database ID of the event log processed by the secondary | url
|geo_cursor_last_event_timestamp | Gauge | 10.2 | Last UNIX timestamp of the event log processed by the secondary | url
|geo_status_last_updated_timestamp | Gauge | 10.2 | Last timestamp when the status was successfully updated | url
|geo_status_failed_total | Counter | 10.2 | Number of times retrieving the status from the Geo Node failed | url
## Metrics shared directory ## Metrics shared directory
GitLab's Prometheus client requires a directory to store metrics data shared between multi-process services. GitLab's Prometheus client requires a directory to store metrics data shared between multi-process services.
......
...@@ -1010,7 +1010,7 @@ module API ...@@ -1010,7 +1010,7 @@ module API
class GeoNodeStatus < Grape::Entity class GeoNodeStatus < Grape::Entity
expose :id expose :id
expose :db_replication_lag expose :db_replication_lag_seconds
expose :health expose :health
expose :healthy?, as: :healthy expose :healthy?, as: :healthy
expose :repositories_count expose :repositories_count
...@@ -1023,9 +1023,9 @@ module API ...@@ -1023,9 +1023,9 @@ module API
expose :attachments_synced_count expose :attachments_synced_count
expose :attachments_failed_count expose :attachments_failed_count
expose :last_event_id expose :last_event_id
expose :last_event_date expose :last_event_timestamp
expose :cursor_last_event_id expose :cursor_last_event_id
expose :cursor_last_event_date expose :cursor_last_event_timestamp
end end
class PersonalAccessToken < Grape::Entity class PersonalAccessToken < Grape::Entity
......
...@@ -12,6 +12,7 @@ module Gitlab ...@@ -12,6 +12,7 @@ module Gitlab
geo_oauth_application geo_oauth_application
).freeze ).freeze
COMMON_JOBS = %i(metrics_update_job).freeze
SECONDARY_JOBS = %i(repository_sync_job file_download_job).freeze SECONDARY_JOBS = %i(repository_sync_job file_download_job).freeze
FDW_SCHEMA = 'gitlab_secondary'.freeze FDW_SCHEMA = 'gitlab_secondary'.freeze
...@@ -95,6 +96,10 @@ module Gitlab ...@@ -95,6 +96,10 @@ module Gitlab
Sidekiq::Cron::Job.find('geo_file_download_dispatch_worker') Sidekiq::Cron::Job.find('geo_file_download_dispatch_worker')
end end
def self.metrics_update_job
Sidekiq::Cron::Job.find('geo_metrics_update_worker')
end
def self.configure_primary_jobs! def self.configure_primary_jobs!
self.enable_all_cron_jobs! self.enable_all_cron_jobs!
SECONDARY_JOBS.each { |job| self.__send__(job).try(:disable!) } # rubocop:disable GitlabSecurity/PublicSend SECONDARY_JOBS.each { |job| self.__send__(job).try(:disable!) } # rubocop:disable GitlabSecurity/PublicSend
...@@ -102,11 +107,11 @@ module Gitlab ...@@ -102,11 +107,11 @@ module Gitlab
def self.configure_secondary_jobs! def self.configure_secondary_jobs!
self.disable_all_cron_jobs! self.disable_all_cron_jobs!
SECONDARY_JOBS.each { |job| self.__send__(job).try(:enable!) } # rubocop:disable GitlabSecurity/PublicSend (COMMON_JOBS + SECONDARY_JOBS).each { |job| self.__send__(job).try(:enable!) } # rubocop:disable GitlabSecurity/PublicSend
end end
def self.disable_all_geo_jobs! def self.disable_all_geo_jobs!
SECONDARY_JOBS.each { |job| self.__send__(job).try(:disable!) } # rubocop:disable GitlabSecurity/PublicSend (COMMON_JOBS + SECONDARY_JOBS).each { |job| self.__send__(job).try(:disable!) } # rubocop:disable GitlabSecurity/PublicSend
end end
def self.disable_all_cron_jobs! def self.disable_all_cron_jobs!
......
...@@ -7,7 +7,7 @@ module Gitlab ...@@ -7,7 +7,7 @@ module Gitlab
return '' unless Gitlab::Geo.secondary? return '' unless Gitlab::Geo.secondary?
return 'The Geo database configuration file is missing.' unless Gitlab::Geo.geo_database_configured? return 'The Geo database configuration file is missing.' unless Gitlab::Geo.geo_database_configured?
return 'The Geo node has a database that is not configured for streaming replication with the primary node.' unless self.database_secondary? return 'The Geo node has a database that is not configured for streaming replication with the primary node.' unless self.database_secondary?
return 'The Geo node does not appear to be replicating data from the primary node.' unless self.db_replication_lag.present? return 'The Geo node does not appear to be replicating data from the primary node.' unless self.db_replication_lag_seconds.present?
database_version = self.get_database_version.to_i database_version = self.get_database_version.to_i
migration_version = self.get_migration_version.to_i migration_version = self.get_migration_version.to_i
...@@ -60,8 +60,9 @@ module Gitlab ...@@ -60,8 +60,9 @@ module Gitlab
.fetch('pg_is_in_recovery') == 't' .fetch('pg_is_in_recovery') == 't'
end end
def self.db_replication_lag def self.db_replication_lag_seconds
# Obtain the replication lag in seconds # Obtain the replication lag in seconds
lag =
ActiveRecord::Base.connection.execute(' ActiveRecord::Base.connection.execute('
SELECT CASE SELECT CASE
WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location()
...@@ -72,6 +73,8 @@ module Gitlab ...@@ -72,6 +73,8 @@ module Gitlab
AS replication_lag') AS replication_lag')
.first .first
.fetch('replication_lag') .fetch('replication_lag')
lag.present? ? lag.to_i : lag
end end
end end
end end
......
...@@ -273,9 +273,9 @@ describe Admin::GeoNodesController, :postgresql do ...@@ -273,9 +273,9 @@ describe Admin::GeoNodesController, :postgresql do
repositories_synced_count: 5, repositories_synced_count: 5,
repositories_failed_count: 0, repositories_failed_count: 0,
last_event_id: 2, last_event_id: 2,
last_event_date: Time.now.iso8601, last_event_timestamp: Time.now.to_i,
cursor_last_event_id: 1, cursor_last_event_id: 1,
cursor_last_event_date: Time.now.iso8601 cursor_last_event_timestamp: Time.now.to_i
) )
end end
......
...@@ -10,14 +10,14 @@ ...@@ -10,14 +10,14 @@
"lfs_objects_count", "lfs_objects_count",
"lfs_objects_failed_count", "lfs_objects_failed_count",
"lfs_objects_synced_count", "lfs_objects_synced_count",
"db_replication_lag", "db_replication_lag_seconds",
"repositories_count", "repositories_count",
"repositories_failed_count", "repositories_failed_count",
"repositories_synced_count", "repositories_synced_count",
"last_event_id", "last_event_id",
"last_event_date", "last_event_timestamp",
"cursor_last_event_id", "cursor_last_event_id",
"cursor_last_event_date" "cursor_last_event_timestamp"
], ],
"properties" : { "properties" : {
"id": { "type": "integer" }, "id": { "type": "integer" },
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
"attachments_failed_count": { "type": "integer" }, "attachments_failed_count": { "type": "integer" },
"attachments_synced_count": { "type": "integer" }, "attachments_synced_count": { "type": "integer" },
"attachments_synced_in_percentage": { "type": "string" }, "attachments_synced_in_percentage": { "type": "string" },
"db_replication_lag": { "type": ["integer", "null"] }, "db_replication_lag_seconds": { "type": ["integer", "null"] },
"lfs_objects_count": { "type": "integer" }, "lfs_objects_count": { "type": "integer" },
"lfs_objects_failed_count": { "type": "integer" }, "lfs_objects_failed_count": { "type": "integer" },
"lfs_objects_synced_count": { "type": "integer" }, "lfs_objects_synced_count": { "type": "integer" },
...@@ -37,9 +37,9 @@ ...@@ -37,9 +37,9 @@
"repositories_synced_count": { "type": "integer" }, "repositories_synced_count": { "type": "integer" },
"repositories_synced_in_percentage": { "type": "string" }, "repositories_synced_in_percentage": { "type": "string" },
"last_event_id": { "type": ["integer", "null"] }, "last_event_id": { "type": ["integer", "null"] },
"last_event_date": { "type": ["string", "null"] }, "last_event_timestamp": { "type": ["integer", "null"] },
"cursor_last_event_id": { "type": ["integer", "null"] }, "cursor_last_event_id": { "type": ["integer", "null"] },
"cursor_last_event_date": { "type": ["string", "null"] } "cursor_last_event_timestamp": { "type": ["integer", "null"] }
}, },
"additionalProperties": false "additionalProperties": false
} }
...@@ -15,7 +15,7 @@ describe Gitlab::Geo::HealthCheck, :postgresql do ...@@ -15,7 +15,7 @@ describe Gitlab::Geo::HealthCheck, :postgresql do
allow(described_class).to receive(:database_secondary?).and_return(true) allow(described_class).to receive(:database_secondary?).and_return(true)
allow(described_class).to receive(:get_database_version).and_return('20170101') allow(described_class).to receive(:get_database_version).and_return('20170101')
allow(described_class).to receive(:get_migration_version).and_return('20170201') allow(described_class).to receive(:get_migration_version).and_return('20170201')
allow(described_class).to receive(:db_replication_lag).and_return(0) allow(described_class).to receive(:db_replication_lag_seconds).and_return(0)
message = subject.perform_checks message = subject.perform_checks
...@@ -54,7 +54,7 @@ describe Gitlab::Geo::HealthCheck, :postgresql do ...@@ -54,7 +54,7 @@ describe Gitlab::Geo::HealthCheck, :postgresql do
it 'returns an error when Geo database version does not match the latest migration version' do it 'returns an error when Geo database version does not match the latest migration version' do
allow(described_class).to receive(:database_secondary?).and_return(true) allow(described_class).to receive(:database_secondary?).and_return(true)
allow(subject).to receive(:get_database_version) { 1 } allow(subject).to receive(:get_database_version) { 1 }
allow(described_class).to receive(:db_replication_lag).and_return(0) allow(described_class).to receive(:db_replication_lag_seconds).and_return(0)
expect(subject.perform_checks).to match(/Current Geo database version \([0-9]+\) does not match latest migration \([0-9]+\)/) expect(subject.perform_checks).to match(/Current Geo database version \([0-9]+\) does not match latest migration \([0-9]+\)/)
end end
...@@ -62,14 +62,14 @@ describe Gitlab::Geo::HealthCheck, :postgresql do ...@@ -62,14 +62,14 @@ describe Gitlab::Geo::HealthCheck, :postgresql do
it 'returns an error when latest migration version does not match the Geo database version' do it 'returns an error when latest migration version does not match the Geo database version' do
allow(described_class).to receive(:database_secondary?).and_return(true) allow(described_class).to receive(:database_secondary?).and_return(true)
allow(subject).to receive(:get_migration_version) { 1 } allow(subject).to receive(:get_migration_version) { 1 }
allow(described_class).to receive(:db_replication_lag).and_return(0) allow(described_class).to receive(:db_replication_lag_seconds).and_return(0)
expect(subject.perform_checks).to match(/Current Geo database version \([0-9]+\) does not match latest migration \([0-9]+\)/) expect(subject.perform_checks).to match(/Current Geo database version \([0-9]+\) does not match latest migration \([0-9]+\)/)
end end
it 'returns an error when replication lag is not present' do it 'returns an error when replication lag is not present' do
allow(described_class).to receive(:database_secondary?).and_return(true) allow(described_class).to receive(:database_secondary?).and_return(true)
allow(described_class).to receive(:db_replication_lag).and_return(nil) allow(described_class).to receive(:db_replication_lag_seconds).and_return(nil)
expect(subject.perform_checks).to match(/The Geo node does not appear to be replicating data from the primary node/) expect(subject.perform_checks).to match(/The Geo node does not appear to be replicating data from the primary node/)
end end
......
...@@ -186,7 +186,7 @@ describe Gitlab::Geo, :geo do ...@@ -186,7 +186,7 @@ describe Gitlab::Geo, :geo do
end end
describe '.configure_cron_jobs!' do describe '.configure_cron_jobs!' do
JOBS = %w(ldap_test geo_repository_sync_worker geo_file_download_dispatch_worker).freeze JOBS = %w(ldap_test geo_repository_sync_worker geo_file_download_dispatch_worker geo_metrics_update_worker).freeze
def init_cron_job(job_name, class_name) def init_cron_job(job_name, class_name)
job = Sidekiq::Cron::Job.new( job = Sidekiq::Cron::Job.new(
...@@ -211,6 +211,7 @@ describe Gitlab::Geo, :geo do ...@@ -211,6 +211,7 @@ describe Gitlab::Geo, :geo do
expect(described_class.repository_sync_job).not_to be_enabled expect(described_class.repository_sync_job).not_to be_enabled
expect(described_class.file_download_job).not_to be_enabled expect(described_class.file_download_job).not_to be_enabled
expect(described_class.metrics_update_job).to be_enabled
expect(Sidekiq::Cron::Job.find('ldap_test')).to be_enabled expect(Sidekiq::Cron::Job.find('ldap_test')).to be_enabled
end end
...@@ -222,15 +223,17 @@ describe Gitlab::Geo, :geo do ...@@ -222,15 +223,17 @@ describe Gitlab::Geo, :geo do
expect(Sidekiq::Cron::Job.find('ldap_test')).not_to be_enabled expect(Sidekiq::Cron::Job.find('ldap_test')).not_to be_enabled
expect(described_class.repository_sync_job).to be_enabled expect(described_class.repository_sync_job).to be_enabled
expect(described_class.file_download_job).to be_enabled expect(described_class.file_download_job).to be_enabled
expect(described_class.metrics_update_job).to be_enabled
end end
it 'deactivates all jobs when Geo is not active' do it 'deactivates all jobs when Geo is not active' do
GeoNode.update_all(enabled: false) stub_current_geo_node(nil)
described_class.configure_cron_jobs! described_class.configure_cron_jobs!
expect(described_class.repository_sync_job).not_to be_enabled expect(described_class.repository_sync_job).not_to be_enabled
expect(described_class.file_download_job).not_to be_enabled expect(described_class.file_download_job).not_to be_enabled
expect(described_class.metrics_update_job).not_to be_enabled
expect(Sidekiq::Cron::Job.find('ldap_test')).to be_enabled expect(Sidekiq::Cron::Job.find('ldap_test')).to be_enabled
end end
......
...@@ -124,18 +124,18 @@ describe GeoNodeStatus do ...@@ -124,18 +124,18 @@ describe GeoNodeStatus do
end end
end end
describe '#db_replication_lag' do describe '#db_replication_lag_seconds' do
it 'returns the set replication lag if secondary' do it 'returns the set replication lag if secondary' do
allow(Gitlab::Geo).to receive(:secondary?).and_return(true) allow(Gitlab::Geo).to receive(:secondary?).and_return(true)
allow(Gitlab::Geo::HealthCheck).to receive(:db_replication_lag).and_return(1000) allow(Gitlab::Geo::HealthCheck).to receive(:db_replication_lag_seconds).and_return(1000)
expect(subject.db_replication_lag).to eq(1000) expect(subject.db_replication_lag_seconds).to eq(1000)
end end
it "doesn't attempt to set replication lag if primary" do it "doesn't attempt to set replication lag if primary" do
expect(Gitlab::Geo::HealthCheck).not_to receive(:db_replication_lag) expect(Gitlab::Geo::HealthCheck).not_to receive(:db_replication_lag_seconds)
expect(subject.db_replication_lag).to eq(nil) expect(subject.db_replication_lag_seconds).to eq(nil)
end end
end end
...@@ -217,25 +217,25 @@ describe GeoNodeStatus do ...@@ -217,25 +217,25 @@ describe GeoNodeStatus do
end end
end end
describe '#last_event_id and #last_event_date' do describe '#last_event_id and #last_event_timestamp' do
it 'returns nil when no events are available' do it 'returns nil when no events are available' do
expect(subject.last_event_id).to be_nil expect(subject.last_event_id).to be_nil
expect(subject.last_event_date).to be_nil expect(subject.last_event_timestamp).to be_nil
end end
it 'returns the latest event' do it 'returns the latest event' do
created_at = Date.new(2017, 10, 22) created_at = Date.today.to_time(:utc)
event = create(:geo_event_log, created_at: created_at) event = create(:geo_event_log, created_at: created_at)
expect(subject.last_event_id).to eq(event.id) expect(subject.last_event_id).to eq(event.id)
expect(subject.last_event_date).to eq(created_at) expect(subject.last_event_timestamp).to eq(created_at.to_i)
end end
end end
describe '#cursor_last_event_id and #cursor_last_event_date' do describe '#cursor_last_event_id and #cursor_last_event_timestamp' do
it 'returns nil when no events are available' do it 'returns nil when no events are available' do
expect(subject.cursor_last_event_id).to be_nil expect(subject.cursor_last_event_id).to be_nil
expect(subject.cursor_last_event_date).to be_nil expect(subject.cursor_last_event_timestamp).to be_nil
end end
it 'returns the latest event ID if secondary' do it 'returns the latest event ID if secondary' do
...@@ -248,14 +248,25 @@ describe GeoNodeStatus do ...@@ -248,14 +248,25 @@ describe GeoNodeStatus do
it "doesn't attempt to retrieve cursor if primary" do it "doesn't attempt to retrieve cursor if primary" do
create(:geo_event_log_state) create(:geo_event_log_state)
expect(subject.cursor_last_event_date).to eq(nil) expect(subject.cursor_last_event_timestamp).to eq(nil)
expect(subject.cursor_last_event_id).to eq(nil) expect(subject.cursor_last_event_id).to eq(nil)
end end
end end
# Hash-style attribute access on the status object.
describe '#[]' do
  it 'returns values for each attribute' do
    # Attribute name => value expected from the enclosing spec's fixtures.
    expectations = { repositories_count: 4, repositories_synced_count: 0 }

    expectations.each do |attribute, value|
      expect(subject[attribute]).to eq(value)
    end
  end

  it 'raises an error for invalid attributes' do
    lookup = -> { subject[:testme] }

    expect(&lookup).to raise_error(NoMethodError)
  end
end
context 'when no values are available' do context 'when no values are available' do
it 'returns 0 for each attribute' do it 'returns 0 for each attribute' do
allow(Gitlab::Geo::HealthCheck).to receive(:db_replication_lag).and_return(nil) allow(Gitlab::Geo::HealthCheck).to receive(:db_replication_lag_seconds).and_return(nil)
subject.attachments_count = nil subject.attachments_count = nil
subject.attachments_synced_count = nil subject.attachments_synced_count = nil
subject.attachments_failed_count = nil subject.attachments_failed_count = nil
...@@ -266,11 +277,11 @@ describe GeoNodeStatus do ...@@ -266,11 +277,11 @@ describe GeoNodeStatus do
subject.repositories_synced_count = nil subject.repositories_synced_count = nil
subject.repositories_failed_count = nil subject.repositories_failed_count = nil
subject.last_event_id = nil subject.last_event_id = nil
subject.last_event_date = nil subject.last_event_timestamp = nil
subject.cursor_last_event_id = nil subject.cursor_last_event_id = nil
subject.cursor_last_event_date = nil subject.cursor_last_event_timestamp = nil
expect(subject.db_replication_lag).to be_nil expect(subject.db_replication_lag_seconds).to be_nil
expect(subject.repositories_count).to be_zero expect(subject.repositories_count).to be_zero
expect(subject.repositories_synced_count).to be_zero expect(subject.repositories_synced_count).to be_zero
expect(subject.repositories_synced_in_percentage).to be_zero expect(subject.repositories_synced_in_percentage).to be_zero
...@@ -284,9 +295,9 @@ describe GeoNodeStatus do ...@@ -284,9 +295,9 @@ describe GeoNodeStatus do
expect(subject.attachments_failed_count).to be_zero expect(subject.attachments_failed_count).to be_zero
expect(subject.attachments_synced_in_percentage).to be_zero expect(subject.attachments_synced_in_percentage).to be_zero
expect(subject.last_event_id).to be_nil expect(subject.last_event_id).to be_nil
expect(subject.last_event_date).to be_nil expect(subject.last_event_timestamp).to be_nil
expect(subject.cursor_last_event_id).to be_nil expect(subject.cursor_last_event_id).to be_nil
expect(subject.cursor_last_event_date).to be_nil expect(subject.cursor_last_event_timestamp).to be_nil
end end
end end
end end
require 'spec_helper'
# Specs for Geo::MetricsUpdateService#execute: with Prometheus metrics enabled,
# the Geo node status values are expected to be readable back from
# Gitlab::Metrics, keyed by each node's URL.
describe Geo::MetricsUpdateService, :geo do
  include ::EE::GeoHelpers

  subject { described_class.new }

  # Integer epoch seconds, used for the *_timestamp fields of the stubbed payload.
  let(:timestamp) { Time.now.to_i }

  before do
    allow(Gitlab::Metrics).to receive(:prometheus_metrics_enabled?).and_return(true)
  end

  describe '#execute' do
    before do
      data = {
        health: 'OK',
        db_replication_lag_seconds: 0,
        repositories_count: 10,
        repositories_synced_count: 1,
        repositories_failed_count: 2,
        lfs_objects_count: 100,
        lfs_objects_synced_count: 50,
        lfs_objects_failed_count: 12,
        attachments_count: 30,
        attachments_synced_count: 30,
        attachments_failed_count: 25,
        last_event_id: 2,
        last_event_timestamp: timestamp,
        cursor_last_event_id: 1,
        cursor_last_event_timestamp: timestamp
      }

      # Every status request made through Geo::NodeStatusService.get receives
      # this same canned, successful response.
      request = double(success?: true, parsed_response: data.stringify_keys, code: 200)
      allow(Geo::NodeStatusService).to receive(:get).and_return(request)
    end

    context 'when node is the primary' do
      set(:primary) { create(:geo_node, :primary) }
      set(:secondary) { create(:geo_node) }
      set(:another_secondary) { create(:geo_node) }

      before do
        stub_current_geo_node(primary)
      end

      it 'attempts to retrieve metrics from all nodes' do
        subject.execute

        expect(Gitlab::Metrics.provide_metric(:geo_db_replication_lag_seconds).values.count).to eq(2)
        expect(Gitlab::Metrics.provide_metric(:geo_repositories).values.count).to eq(2)
        # Check each secondary individually; the original asserted on
        # `secondary` twice and never looked at `another_secondary`.
        expect(Gitlab::Metrics.provide_metric(:geo_repositories).get({ url: secondary.url })).to eq(10)
        expect(Gitlab::Metrics.provide_metric(:geo_repositories).get({ url: another_secondary.url })).to eq(10)
      end
    end

    context 'when node is a secondary' do
      set(:secondary) { create(:geo_node) }

      before do
        stub_current_geo_node(secondary)
      end

      # Reads the current value of +metric_name+ for the secondary's URL label.
      def metric_value(metric_name)
        Gitlab::Metrics.provide_metric(metric_name).get({ url: secondary.url })
      end

      it 'adds gauges for various metrics' do
        subject.execute

        expect(metric_value(:geo_db_replication_lag_seconds)).to eq(0)
        expect(metric_value(:geo_repositories)).to eq(10)
        expect(metric_value(:geo_repositories_synced)).to eq(1)
        expect(metric_value(:geo_repositories_failed)).to eq(2)
        expect(metric_value(:geo_lfs_objects)).to eq(100)
        expect(metric_value(:geo_lfs_objects_synced)).to eq(50)
        expect(metric_value(:geo_lfs_objects_failed)).to eq(12)
        expect(metric_value(:geo_attachments)).to eq(30)
        expect(metric_value(:geo_attachments_synced)).to eq(30)
        expect(metric_value(:geo_attachments_failed)).to eq(25)
        expect(metric_value(:geo_last_event_id)).to eq(2)
        expect(metric_value(:geo_last_event_timestamp)).to eq(timestamp)
        expect(metric_value(:geo_cursor_last_event_id)).to eq(1)
        expect(metric_value(:geo_cursor_last_event_timestamp)).to eq(timestamp)
        expect(metric_value(:geo_status_last_updated_timestamp)).to be_truthy
      end

      it 'increments a counter when metrics fail to retrieve' do
        allow(subject).to receive(:node_status).and_return(GeoNodeStatus.new(success: false))

        # Run once to get the gauge set
        subject.execute

        expect { subject.execute }.to change { metric_value(:geo_status_failed_total) }.by(1)
      end
    end
  end
end
...@@ -6,7 +6,7 @@ describe Geo::NodeStatusService do ...@@ -6,7 +6,7 @@ describe Geo::NodeStatusService do
subject { described_class.new } subject { described_class.new }
describe 'KEYS' do describe '#status_keys' do
it 'matches the serializer keys' do it 'matches the serializer keys' do
exceptions = %w[ exceptions = %w[
id id
...@@ -22,7 +22,7 @@ describe Geo::NodeStatusService do ...@@ -22,7 +22,7 @@ describe Geo::NodeStatusService do
.keys .keys
.map(&:to_s) - exceptions .map(&:to_s) - exceptions
expect(described_class::KEYS).to match_array(expected) expect(subject.status_keys).to match_array(expected)
end end
end end
...@@ -41,7 +41,7 @@ describe Geo::NodeStatusService do ...@@ -41,7 +41,7 @@ describe Geo::NodeStatusService do
it 'parses a 200 response' do it 'parses a 200 response' do
data = { health: 'OK', data = { health: 'OK',
db_replication_lag: 0, db_replication_lag_seconds: 0,
repositories_count: 10, repositories_count: 10,
repositories_synced_count: 1, repositories_synced_count: 1,
repositories_failed_count: 2, repositories_failed_count: 2,
...@@ -52,15 +52,16 @@ describe Geo::NodeStatusService do ...@@ -52,15 +52,16 @@ describe Geo::NodeStatusService do
attachments_synced_count: 30, attachments_synced_count: 30,
attachments_failed_count: 25, attachments_failed_count: 25,
last_event_id: 2, last_event_id: 2,
last_event_date: Time.now, last_event_timestamp: Time.now.to_i,
cursor_last_event_id: 1, cursor_last_event_id: 1,
cursor_last_event_date: Time.now } cursor_last_event_timestamp: Time.now.to_i }
request = double(success?: true, parsed_response: data.stringify_keys, code: 200) request = double(success?: true, parsed_response: data.stringify_keys, code: 200)
allow(described_class).to receive(:get).and_return(request) allow(described_class).to receive(:get).and_return(request)
status = subject.call(secondary) status = subject.call(secondary)
expect(status).to have_attributes(data) expect(status).to have_attributes(data)
expect(status.success).to be true
end end
it 'omits full response text in status' do it 'omits full response text in status' do
...@@ -73,6 +74,7 @@ describe Geo::NodeStatusService do ...@@ -73,6 +74,7 @@ describe Geo::NodeStatusService do
status = subject.call(secondary) status = subject.call(secondary)
expect(status.health).to eq("Could not connect to Geo node - HTTP Status Code: 401 Unauthorized\n") expect(status.health).to eq("Could not connect to Geo node - HTTP Status Code: 401 Unauthorized\n")
expect(status.success).to be false
end end
it 'alerts on bad SSL certficate' do it 'alerts on bad SSL certficate' do
......
require 'rails_helper'
# Specs for Geo::MetricsUpdateWorker#perform: the worker should only build a
# Geo::MetricsUpdateService when Prometheus metrics are switched on.
RSpec.describe Geo::MetricsUpdateWorker, :geo do
  include ::EE::GeoHelpers

  subject(:worker) { described_class.new }

  describe '#perform' do
    let(:secondary) { create(:geo_node, geo_node_key: create(:geo_node_key)) }

    before do
      stub_current_geo_node(secondary)
    end

    # Stubs the Prometheus on/off switch that the examples below toggle.
    def stub_prometheus_metrics(enabled)
      allow(Gitlab::Metrics).to receive(:prometheus_metrics_enabled?).and_return(enabled)
    end

    it 'does not execute when Prometheus metrics are disabled' do
      stub_prometheus_metrics(false)

      expect(Geo::MetricsUpdateService).not_to receive(:new)

      worker.perform
    end

    it 'executes when Prometheus metrics are enabled' do
      stub_prometheus_metrics(true)

      expect(Geo::MetricsUpdateService).to receive(:new).and_call_original

      worker.perform
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment