Commit 1987ad53 authored by Stan Hu's avatar Stan Hu Committed by Nick Thomas

Add support for logging Prometheus metrics for Geo

parent 715ea16f
......@@ -58,8 +58,8 @@ class GeoNodeStatus {
this.setHealthStatus(status.healthy);
// Replication lag can be nil if the secondary isn't actually streaming
if (status.db_replication_lag) {
const parsedTime = parseSeconds(status.db_replication_lag, {
if (status.db_replication_lag_seconds !== null && status.db_replication_lag_seconds >= 0) {
const parsedTime = parseSeconds(status.db_replication_lag_seconds, {
hoursPerDay: 24,
daysPerWeek: 7,
});
......@@ -96,8 +96,17 @@ class GeoNodeStatus {
this.$attachmentsSynced.text(attachmentText);
this.$attachmentsFailed.text(attachmentFailedText);
const eventDate = gl.utils.formatDate(new Date(status.last_event_date));
const cursorDate = gl.utils.formatDate(new Date(status.cursor_last_event_date));
let eventDate = 'N/A';
let cursorDate = 'N/A';
if (status.last_event_timestamp !== null) {
eventDate = gl.utils.formatDate(new Date(status.last_event_timestamp * 1000));
}
if (status.cursor_last_event_timestamp !== null) {
cursorDate = gl.utils.formatDate(new Date(status.cursor_last_event_timestamp * 1000));
}
this.$lastEventSeen.text(`${status.last_event_id} (${eventDate})`);
this.$lastCursorEvent.text(`${status.cursor_last_event_id} (${cursorDate})`);
if (status.health === 'Healthy') {
......
class GeoNodeStatus
include ActiveModel::Model
attr_accessor :id
attr_accessor :id, :success
attr_writer :health
def health
......@@ -14,14 +14,14 @@ class GeoNodeStatus
health.blank?
end
def db_replication_lag
return @db_replication_lag if defined?(@db_replication_lag)
def db_replication_lag_seconds
return @db_replication_lag_seconds if defined?(@db_replication_lag_seconds)
@db_replication_lag = Gitlab::Geo::HealthCheck.db_replication_lag if Gitlab::Geo.secondary?
@db_replication_lag_seconds = Gitlab::Geo::HealthCheck.db_replication_lag_seconds if Gitlab::Geo.secondary?
end
def db_replication_lag=(value)
@db_replication_lag = value
def db_replication_lag_seconds=(value)
@db_replication_lag_seconds = value
end
def last_event_id
......@@ -32,12 +32,12 @@ class GeoNodeStatus
@last_event_id = value
end
def last_event_date
@last_event_date ||= Geo::EventLog.latest_event&.created_at
def last_event_timestamp
@last_event_timestamp ||= Geo::EventLog.latest_event&.created_at&.to_i
end
def last_event_date=(value)
@last_event_date = value
def last_event_timestamp=(value)
@last_event_timestamp = value
end
def cursor_last_event_id
......@@ -50,16 +50,16 @@ class GeoNodeStatus
@cursor_last_event_id = value
end
def cursor_last_event_date
def cursor_last_event_timestamp
event_id = cursor_last_event_id
return unless event_id
@cursor_last_event_date ||= Geo::EventLog.find_by(id: event_id)&.created_at
@cursor_last_event_timestamp ||= Geo::EventLog.find_by(id: event_id)&.created_at&.to_i
end
def cursor_last_event_date=(value)
@cursor_last_event_date = value
def cursor_last_event_timestamp=(value)
@cursor_last_event_timestamp = value
end
def repositories_count
......@@ -159,6 +159,10 @@ class GeoNodeStatus
sync_percentage(attachments_count, attachments_synced_count)
end
# Hash-style reader: forwards +key+ to the public method of the same name on
# this status object, so a caller can write status[:repositories_count].
# A key with no matching public method raises NoMethodError.
def [](key)
public_send(key) # rubocop:disable GitlabSecurity/PublicSend
end
private
def sync_percentage(total, synced)
......
......@@ -15,7 +15,7 @@ class GeoNodeStatusEntity < Grape::Entity
number_to_percentage(node.attachments_synced_in_percentage, precision: 2)
end
expose :db_replication_lag
expose :db_replication_lag_seconds
expose :lfs_objects_count
expose :lfs_objects_synced_count
......@@ -32,7 +32,7 @@ class GeoNodeStatusEntity < Grape::Entity
end
expose :last_event_id
expose :last_event_date
expose :last_event_timestamp
expose :cursor_last_event_id
expose :cursor_last_event_date
expose :cursor_last_event_timestamp
end
module Geo
  # Publishes Geo node status values as Prometheus metrics.
  #
  # When the current node is the Geo primary, the status of every secondary
  # node is fetched; otherwise only the current node's status is fetched.
  # Each Integer status attribute becomes a gauge labelled with the node URL.
  # A failed status fetch increments a per-node failure counter instead.
  class MetricsUpdateService
    METRIC_PREFIX = 'geo_'.freeze

    # Entry point. Does nothing when Geo is not enabled.
    def execute
      return unless Gitlab::Geo.enabled?

      if Gitlab::Geo.primary?
        fetch_secondary_geo_nodes_metrics
      else
        fetch_current_geo_node_metrics
      end
    end

    private

    def fetch_secondary_geo_nodes_metrics
      Gitlab::Geo.secondary_nodes.find_each { |node| fetch_geo_node_metrics(node) }
    end

    def fetch_current_geo_node_metrics
      fetch_geo_node_metrics(Gitlab::Geo.current_node)
    end

    # Fetches the status for +node+ and records it.
    #
    # On an unsuccessful fetch only the failure counter is touched: no gauge
    # is updated, including the "last updated" timestamp.
    def fetch_geo_node_metrics(node)
      status = node_status(node)

      unless status.success
        increment_failed_status_counter(node)
        return
      end

      labels = metric_labels(node)

      NodeStatusService::STATUS_DATA.each do |key, docstring|
        value = status[key]

        # Only numeric attributes can be gauges; this skips the textual
        # health summary and any attribute that came back nil.
        next unless value.is_a?(Integer)

        gauge = Gitlab::Metrics.gauge(gauge_metric_name(key), docstring, {}, :max)
        gauge.set(labels, value)
      end

      set_last_updated_at(node)
    end

    def node_status(node)
      NodeStatusService.new.call(node)
    end

    # Records the current UNIX time as the moment this node's status was last
    # published successfully.
    def set_last_updated_at(node)
      gauge = Gitlab::Metrics.gauge(
        :geo_status_last_updated_timestamp,
        'UNIX timestamp of last time Geo node status was updated internally',
        {},
        :max)

      gauge.set(metric_labels(node), Time.now.to_i)
    end

    # Labels are supplied at increment time, the same way the gauges receive
    # them at set time. The counter is requested under one metric name for
    # every node, so labels baked in at registration would (assuming the
    # registry hands back the already-registered metric; TODO confirm)
    # attribute every failure to whichever node happened to fail first.
    def increment_failed_status_counter(node)
      failed_status_counter.increment(metric_labels(node))
    end

    def failed_status_counter
      Gitlab::Metrics.counter(
        :geo_status_failed_total,
        'Total number of times status for Geo node failed to retrieve',
        {})
    end

    # Builds the gauge name for a status attribute, e.g.
    # :repositories_count => :geo_repositories.
    def gauge_metric_name(name)
      # Prometheus naming conventions in
      # https://prometheus.io/docs/instrumenting/writing_exporters/#naming says
      # that _count and _total should be reserved for counters
      base_name = name.to_s.sub(/(?:_count|_total)\z/, '')

      (METRIC_PREFIX + base_name).to_sym
    end

    def metric_labels(node)
      { url: node.url }
    end
  end
end
......@@ -3,53 +3,59 @@ module Geo
include Gitlab::CurrentSettings
include HTTParty
KEYS = %w(
health
db_replication_lag
repositories_count
repositories_synced_count
repositories_failed_count
lfs_objects_count
lfs_objects_synced_count
lfs_objects_failed_count
attachments_count
attachments_synced_count
attachments_failed_count
last_event_id
last_event_date
cursor_last_event_id
cursor_last_event_date
).freeze
STATUS_DATA = {
health: 'Summary of health status',
db_replication_lag_seconds: 'Database replication lag (seconds)',
repositories_count: 'Total number of repositories available on primary',
repositories_synced_count: 'Number of repositories synced on secondary',
repositories_failed_count: 'Number of repositories failed to sync on secondary',
lfs_objects_count: 'Total number of LFS objects available on primary',
lfs_objects_synced_count: 'Number of LFS objects synced on secondary',
lfs_objects_failed_count: 'Number of LFS objects failed to sync on secondary',
attachments_count: 'Total number of file attachments available on primary',
attachments_synced_count: 'Number of attachments synced on secondary',
attachments_failed_count: 'Number of attachments failed to sync on secondary',
last_event_id: 'Database ID of the latest event log entry on the primary',
last_event_timestamp: 'UNIX timestamp of the latest event log entry on the primary',
cursor_last_event_id: 'Last database ID of the event log processed by the secondary',
cursor_last_event_timestamp: 'Last UNIX timestamp of the event log processed by the secondary'
}.freeze
def call(geo_node)
values =
begin
response = self.class.get(geo_node.status_url, headers: headers, timeout: timeout)
data = { id: geo_node.id }
if response.success?
response.parsed_response.values_at(*KEYS)
else
message = "Could not connect to Geo node - HTTP Status Code: #{response.code} #{response.message}"
payload = response.parsed_response
details =
if payload.is_a?(Hash)
payload['message']
else
# The return value can be a giant blob of HTML; ignore it
''
end
begin
response = self.class.get(geo_node.status_url, headers: headers, timeout: timeout)
data[:success] = response.success?
Array([message, details].compact.join("\n"))
end
rescue Gitlab::Geo::GeoNodeNotFoundError
['This GitLab instance does not appear to be configured properly as a Geo node. Make sure the URLs are using the correct fully-qualified domain names.']
rescue OpenSSL::Cipher::CipherError
['Error decrypting the Geo secret from the database. Check that the primary uses the correct db_key_base.']
rescue HTTParty::Error, Timeout::Error, SocketError, SystemCallError, OpenSSL::SSL::SSLError => e
[e.message]
if response.success?
data.merge!(response.parsed_response.symbolize_keys.slice(*STATUS_DATA.keys))
else
message = "Could not connect to Geo node - HTTP Status Code: #{response.code} #{response.message}"
payload = response.parsed_response
details =
if payload.is_a?(Hash)
payload['message']
else
# The return value can be a giant blob of HTML; ignore it
''
end
data[:health] = [message, details].compact.join("\n")
end
rescue Gitlab::Geo::GeoNodeNotFoundError
data[:health] = 'This GitLab instance does not appear to be configured properly as a Geo node. Make sure the URLs are using the correct fully-qualified domain names.'
rescue OpenSSL::Cipher::CipherError
data[:health] = 'Error decrypting the Geo secret from the database. Check that the primary uses the correct db_key_base.'
rescue HTTParty::Error, Timeout::Error, SocketError, SystemCallError, OpenSSL::SSL::SSLError => e
data[:health] = e.message
end
GeoNodeStatus.new(data)
end
GeoNodeStatus.new(KEYS.zip(values).to_h.merge(id: geo_node.id))
# Names of every attribute listed in STATUS_DATA, as strings.
def status_keys
  STATUS_DATA.keys.map(&:to_s)
end
private
......
# Mixin that runs a block only while holding a Gitlab::ExclusiveLease.
#
# This module calls `lease_timeout` and `log_error` but defines neither, so
# the including class has to supply both. `lease_key` defaults to the
# underscored class name.
module ExclusiveLeaseGuard
  extend ActiveSupport::Concern

  # Key under which the lease is taken; memoized per instance.
  def lease_key
    @lease_key ||= self.class.name.underscore
  end

  # Yields the lease UUID to the block when the lease could be obtained and
  # always cancels the lease afterwards, even if the block raises.
  # When the lease is unavailable an error is logged, the block is not run
  # and nil is returned.
  def try_obtain_lease
    uuid = exclusive_lease.try_obtain

    if uuid
      begin
        yield uuid
      ensure
        release_lease(uuid)
      end
    else
      log_error('Cannot obtain an exclusive lease. There must be another worker already in execution.')
      nil
    end
  end

  # Lease object for this instance, built once from lease_key and lease_timeout.
  def exclusive_lease
    @lease ||= Gitlab::ExclusiveLease.new(lease_key, timeout: lease_timeout)
  end

  # Cancels the lease identified by +uuid+ for this instance's lease key.
  def release_lease(uuid)
    Gitlab::ExclusiveLease.cancel(lease_key, uuid)
  end
end
module Geo
  # Sidekiq worker that runs Geo::MetricsUpdateService under an exclusive
  # lease. It does nothing when Prometheus metrics are disabled.
  class MetricsUpdateWorker
    include Sidekiq::Worker
    include ExclusiveLeaseGuard
    include CronjobQueue

    LEASE_TIMEOUT = 5.minutes

    def perform
      return unless Gitlab::Metrics.prometheus_metrics_enabled?

      try_obtain_lease { Geo::MetricsUpdateService.new.execute }
    end

    # Lease duration handed to ExclusiveLeaseGuard#exclusive_lease.
    def lease_timeout
      LEASE_TIMEOUT
    end

    private

    # ExclusiveLeaseGuard#try_obtain_lease calls this when the lease is
    # already taken. Without it a contended lease would raise NoMethodError
    # instead of being reported and skipped.
    def log_error(message)
      logger.error(message)
    end
  end
end
---
title: Add support for logging Prometheus metrics for Geo
merge_request: !3187
author:
type: added
......@@ -241,13 +241,18 @@ production: &base
ldap_sync_worker:
cron: "30 1 * * *"
# GitLab Geo repository sync worker
# GitLab Geo metrics update worker
# NOTE: This will only take effect if Geo is enabled
geo_metrics_update_worker:
cron: "*/1 * * * *"
# GitLab Geo repository sync worker
# NOTE: This will only take effect if Geo is enabled (secondary nodes only)
geo_repository_sync_worker:
cron: "*/5 * * * *"
# GitLab Geo file download dispatch worker
# NOTE: This will only take effect if Geo is enabled
# NOTE: This will only take effect if Geo is enabled (secondary nodes only)
geo_file_download_dispatch_worker:
cron: "*/10 * * * *"
......
......@@ -443,6 +443,9 @@ Settings.cron_jobs['ldap_sync_worker']['job_class'] = 'LdapSyncWorker'
Settings.cron_jobs['ldap_group_sync_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['ldap_group_sync_worker']['cron'] ||= '0 * * * *'
Settings.cron_jobs['ldap_group_sync_worker']['job_class'] = 'LdapAllGroupsSyncWorker'
Settings.cron_jobs['geo_metrics_update_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['geo_metrics_update_worker']['cron'] ||= '*/1 * * * *'
Settings.cron_jobs['geo_metrics_update_worker']['job_class'] ||= 'Geo::MetricsUpdateWorker'
Settings.cron_jobs['geo_repository_sync_worker'] ||= Settingslogic.new({})
Settings.cron_jobs['geo_repository_sync_worker']['cron'] ||= '*/1 * * * *'
Settings.cron_jobs['geo_repository_sync_worker']['job_class'] ||= 'Geo::RepositorySyncWorker'
......
......@@ -22,7 +22,7 @@ collect metrics from this endpoint. We recommend setting up another Prometheus
server, because the embedded server configuration is overwritten once every
[reconfigure of GitLab][reconfigure]. In the future this will not be required.
## Metrics available
## Unicorn Metrics available
In this experimental phase, only a few metrics are available:
......@@ -48,6 +48,30 @@ In this experimental phase, only a few metrics are available:
| filesystem_circuitbreaker_latency_seconds | Histogram | 9.5 | Latency of the stat check the circuitbreaker uses to probe a shard |
| filesystem_circuitbreaker | Gauge | 9.5 | Whether or not the circuit for a certain shard is broken |
## Sidekiq Metrics available
Sidekiq jobs may also gather metrics, and these metrics can be accessed if the Sidekiq exporter is enabled (e.g. via
the `monitoring.sidekiq_exporter` configuration option in `gitlab.yml`).
| Metric | Type | Since | Description | Labels |
|:--------------------------------- |:--------- |:----- |:----------- |:------ |
|geo_db_replication_lag_seconds | Gauge | 10.2 | Database replication lag (seconds) | url
|geo_repositories | Gauge | 10.2 | Total number of repositories available on primary | url
|geo_repositories_synced | Gauge | 10.2 | Number of repositories synced on secondary | url
|geo_repositories_failed | Gauge | 10.2 | Number of repositories failed to sync on secondary | url
|geo_lfs_objects | Gauge | 10.2 | Total number of LFS objects available on primary | url
|geo_lfs_objects_synced | Gauge | 10.2 | Number of LFS objects synced on secondary | url
|geo_lfs_objects_failed | Gauge | 10.2 | Number of LFS objects failed to sync on secondary | url
|geo_attachments | Gauge | 10.2 | Total number of file attachments available on primary | url
|geo_attachments_synced | Gauge | 10.2 | Number of attachments synced on secondary | url
|geo_attachments_failed | Gauge | 10.2 | Number of attachments failed to sync on secondary | url
|geo_last_event_id | Gauge | 10.2 | Database ID of the latest event log entry on the primary | url
|geo_last_event_timestamp | Gauge | 10.2 | UNIX timestamp of the latest event log entry on the primary | url
|geo_cursor_last_event_id | Gauge | 10.2 | Last database ID of the event log processed by the secondary | url
|geo_cursor_last_event_timestamp | Gauge | 10.2 | Last UNIX timestamp of the event log processed by the secondary | url
|geo_status_last_updated_timestamp | Gauge | 10.2 | Last timestamp when the status was successfully updated | url
|geo_status_failed_total | Counter | 10.2 | Number of times retrieving the status from the Geo Node failed | url
## Metrics shared directory
GitLab's Prometheus client requires a directory to store metrics data shared between multi-process services.
......
......@@ -1010,7 +1010,7 @@ module API
class GeoNodeStatus < Grape::Entity
expose :id
expose :db_replication_lag
expose :db_replication_lag_seconds
expose :health
expose :healthy?, as: :healthy
expose :repositories_count
......@@ -1023,9 +1023,9 @@ module API
expose :attachments_synced_count
expose :attachments_failed_count
expose :last_event_id
expose :last_event_date
expose :last_event_timestamp
expose :cursor_last_event_id
expose :cursor_last_event_date
expose :cursor_last_event_timestamp
end
class PersonalAccessToken < Grape::Entity
......
......@@ -12,6 +12,7 @@ module Gitlab
geo_oauth_application
).freeze
COMMON_JOBS = %i(metrics_update_job).freeze
SECONDARY_JOBS = %i(repository_sync_job file_download_job).freeze
FDW_SCHEMA = 'gitlab_secondary'.freeze
......@@ -95,6 +96,10 @@ module Gitlab
Sidekiq::Cron::Job.find('geo_file_download_dispatch_worker')
end
# Looks up the Sidekiq cron job named 'geo_metrics_update_worker'.
# Callers in this module invoke enable!/disable! on the result through
# `.try`, so a missing job is tolerated there.
def self.metrics_update_job
Sidekiq::Cron::Job.find('geo_metrics_update_worker')
end
def self.configure_primary_jobs!
self.enable_all_cron_jobs!
SECONDARY_JOBS.each { |job| self.__send__(job).try(:disable!) } # rubocop:disable GitlabSecurity/PublicSend
......@@ -102,11 +107,11 @@ module Gitlab
def self.configure_secondary_jobs!
self.disable_all_cron_jobs!
SECONDARY_JOBS.each { |job| self.__send__(job).try(:enable!) } # rubocop:disable GitlabSecurity/PublicSend
(COMMON_JOBS + SECONDARY_JOBS).each { |job| self.__send__(job).try(:enable!) } # rubocop:disable GitlabSecurity/PublicSend
end
def self.disable_all_geo_jobs!
SECONDARY_JOBS.each { |job| self.__send__(job).try(:disable!) } # rubocop:disable GitlabSecurity/PublicSend
(COMMON_JOBS + SECONDARY_JOBS).each { |job| self.__send__(job).try(:disable!) } # rubocop:disable GitlabSecurity/PublicSend
end
def self.disable_all_cron_jobs!
......
......@@ -7,7 +7,7 @@ module Gitlab
return '' unless Gitlab::Geo.secondary?
return 'The Geo database configuration file is missing.' unless Gitlab::Geo.geo_database_configured?
return 'The Geo node has a database that is not configured for streaming replication with the primary node.' unless self.database_secondary?
return 'The Geo node does not appear to be replicating data from the primary node.' unless self.db_replication_lag.present?
return 'The Geo node does not appear to be replicating data from the primary node.' unless self.db_replication_lag_seconds.present?
database_version = self.get_database_version.to_i
migration_version = self.get_migration_version.to_i
......@@ -60,9 +60,10 @@ module Gitlab
.fetch('pg_is_in_recovery') == 't'
end
def self.db_replication_lag
def self.db_replication_lag_seconds
# Obtain the replication lag in seconds
ActiveRecord::Base.connection.execute('
lag =
ActiveRecord::Base.connection.execute('
SELECT CASE
WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location()
THEN 0
......@@ -72,6 +73,8 @@ module Gitlab
AS replication_lag')
.first
.fetch('replication_lag')
lag.present? ? lag.to_i : lag
end
end
end
......
......@@ -273,9 +273,9 @@ describe Admin::GeoNodesController, :postgresql do
repositories_synced_count: 5,
repositories_failed_count: 0,
last_event_id: 2,
last_event_date: Time.now.iso8601,
last_event_timestamp: Time.now.to_i,
cursor_last_event_id: 1,
cursor_last_event_date: Time.now.iso8601
cursor_last_event_timestamp: Time.now.to_i
)
end
......
......@@ -10,14 +10,14 @@
"lfs_objects_count",
"lfs_objects_failed_count",
"lfs_objects_synced_count",
"db_replication_lag",
"db_replication_lag_seconds",
"repositories_count",
"repositories_failed_count",
"repositories_synced_count",
"last_event_id",
"last_event_date",
"last_event_timestamp",
"cursor_last_event_id",
"cursor_last_event_date"
"cursor_last_event_timestamp"
],
"properties" : {
"id": { "type": "integer" },
......@@ -27,7 +27,7 @@
"attachments_failed_count": { "type": "integer" },
"attachments_synced_count": { "type": "integer" },
"attachments_synced_in_percentage": { "type": "string" },
"db_replication_lag": { "type": ["integer", "null"] },
"db_replication_lag_seconds": { "type": ["integer", "null"] },
"lfs_objects_count": { "type": "integer" },
"lfs_objects_failed_count": { "type": "integer" },
"lfs_objects_synced_count": { "type": "integer" },
......@@ -37,9 +37,9 @@
"repositories_synced_count": { "type": "integer" },
"repositories_synced_in_percentage": { "type": "string" },
"last_event_id": { "type": ["integer", "null"] },
"last_event_date": { "type": ["string", "null"] },
"last_event_timestamp": { "type": ["integer", "null"] },
"cursor_last_event_id": { "type": ["integer", "null"] },
"cursor_last_event_date": { "type": ["string", "null"] }
"cursor_last_event_timestamp": { "type": ["integer", "null"] }
},
"additionalProperties": false
}
......@@ -15,7 +15,7 @@ describe Gitlab::Geo::HealthCheck, :postgresql do
allow(described_class).to receive(:database_secondary?).and_return(true)
allow(described_class).to receive(:get_database_version).and_return('20170101')
allow(described_class).to receive(:get_migration_version).and_return('20170201')
allow(described_class).to receive(:db_replication_lag).and_return(0)
allow(described_class).to receive(:db_replication_lag_seconds).and_return(0)
message = subject.perform_checks
......@@ -54,7 +54,7 @@ describe Gitlab::Geo::HealthCheck, :postgresql do
it 'returns an error when Geo database version does not match the latest migration version' do
allow(described_class).to receive(:database_secondary?).and_return(true)
allow(subject).to receive(:get_database_version) { 1 }
allow(described_class).to receive(:db_replication_lag).and_return(0)
allow(described_class).to receive(:db_replication_lag_seconds).and_return(0)
expect(subject.perform_checks).to match(/Current Geo database version \([0-9]+\) does not match latest migration \([0-9]+\)/)
end
......@@ -62,14 +62,14 @@ describe Gitlab::Geo::HealthCheck, :postgresql do
it 'returns an error when latest migration version does not match the Geo database version' do
allow(described_class).to receive(:database_secondary?).and_return(true)
allow(subject).to receive(:get_migration_version) { 1 }
allow(described_class).to receive(:db_replication_lag).and_return(0)
allow(described_class).to receive(:db_replication_lag_seconds).and_return(0)
expect(subject.perform_checks).to match(/Current Geo database version \([0-9]+\) does not match latest migration \([0-9]+\)/)
end
it 'returns an error when replication lag is not present' do
allow(described_class).to receive(:database_secondary?).and_return(true)
allow(described_class).to receive(:db_replication_lag).and_return(nil)
allow(described_class).to receive(:db_replication_lag_seconds).and_return(nil)
expect(subject.perform_checks).to match(/The Geo node does not appear to be replicating data from the primary node/)
end
......
......@@ -186,7 +186,7 @@ describe Gitlab::Geo, :geo do
end
describe '.configure_cron_jobs!' do
JOBS = %w(ldap_test geo_repository_sync_worker geo_file_download_dispatch_worker).freeze
JOBS = %w(ldap_test geo_repository_sync_worker geo_file_download_dispatch_worker geo_metrics_update_worker).freeze
def init_cron_job(job_name, class_name)
job = Sidekiq::Cron::Job.new(
......@@ -211,6 +211,7 @@ describe Gitlab::Geo, :geo do
expect(described_class.repository_sync_job).not_to be_enabled
expect(described_class.file_download_job).not_to be_enabled
expect(described_class.metrics_update_job).to be_enabled
expect(Sidekiq::Cron::Job.find('ldap_test')).to be_enabled
end
......@@ -222,15 +223,17 @@ describe Gitlab::Geo, :geo do
expect(Sidekiq::Cron::Job.find('ldap_test')).not_to be_enabled
expect(described_class.repository_sync_job).to be_enabled
expect(described_class.file_download_job).to be_enabled
expect(described_class.metrics_update_job).to be_enabled
end
it 'deactivates all jobs when Geo is not active' do
GeoNode.update_all(enabled: false)
stub_current_geo_node(nil)
described_class.configure_cron_jobs!
expect(described_class.repository_sync_job).not_to be_enabled
expect(described_class.file_download_job).not_to be_enabled
expect(described_class.metrics_update_job).not_to be_enabled
expect(Sidekiq::Cron::Job.find('ldap_test')).to be_enabled
end
......
......@@ -124,18 +124,18 @@ describe GeoNodeStatus do
end
end
describe '#db_replication_lag' do
describe '#db_replication_lag_seconds' do
it 'returns the set replication lag if secondary' do
allow(Gitlab::Geo).to receive(:secondary?).and_return(true)
allow(Gitlab::Geo::HealthCheck).to receive(:db_replication_lag).and_return(1000)
allow(Gitlab::Geo::HealthCheck).to receive(:db_replication_lag_seconds).and_return(1000)
expect(subject.db_replication_lag).to eq(1000)
expect(subject.db_replication_lag_seconds).to eq(1000)
end
it "doesn't attempt to set replication lag if primary" do
expect(Gitlab::Geo::HealthCheck).not_to receive(:db_replication_lag)
expect(Gitlab::Geo::HealthCheck).not_to receive(:db_replication_lag_seconds)
expect(subject.db_replication_lag).to eq(nil)
expect(subject.db_replication_lag_seconds).to eq(nil)
end
end
......@@ -217,25 +217,25 @@ describe GeoNodeStatus do
end
end
describe '#last_event_id and #last_event_date' do
describe '#last_event_id and #last_event_timestamp' do
it 'returns nil when no events are available' do
expect(subject.last_event_id).to be_nil
expect(subject.last_event_date).to be_nil
expect(subject.last_event_timestamp).to be_nil
end
it 'returns the latest event' do
created_at = Date.new(2017, 10, 22)
created_at = Date.today.to_time(:utc)
event = create(:geo_event_log, created_at: created_at)
expect(subject.last_event_id).to eq(event.id)
expect(subject.last_event_date).to eq(created_at)
expect(subject.last_event_timestamp).to eq(created_at.to_i)
end
end
describe '#cursor_last_event_id and #cursor_last_event_date' do
describe '#cursor_last_event_id and #cursor_last_event_timestamp' do
it 'returns nil when no events are available' do
expect(subject.cursor_last_event_id).to be_nil
expect(subject.cursor_last_event_date).to be_nil
expect(subject.cursor_last_event_timestamp).to be_nil
end
it 'returns the latest event ID if secondary' do
......@@ -248,14 +248,25 @@ describe GeoNodeStatus do
it "doesn't attempt to retrieve cursor if primary" do
create(:geo_event_log_state)
expect(subject.cursor_last_event_date).to eq(nil)
expect(subject.cursor_last_event_timestamp).to eq(nil)
expect(subject.cursor_last_event_id).to eq(nil)
end
end
describe '#[]' do
it 'returns values for each attribute' do
expect(subject[:repositories_count]).to eq(4)
expect(subject[:repositories_synced_count]).to eq(0)
end
it 'raises an error for invalid attributes' do
expect { subject[:testme] }.to raise_error(NoMethodError)
end
end
context 'when no values are available' do
it 'returns 0 for each attribute' do
allow(Gitlab::Geo::HealthCheck).to receive(:db_replication_lag).and_return(nil)
allow(Gitlab::Geo::HealthCheck).to receive(:db_replication_lag_seconds).and_return(nil)
subject.attachments_count = nil
subject.attachments_synced_count = nil
subject.attachments_failed_count = nil
......@@ -266,11 +277,11 @@ describe GeoNodeStatus do
subject.repositories_synced_count = nil
subject.repositories_failed_count = nil
subject.last_event_id = nil
subject.last_event_date = nil
subject.last_event_timestamp = nil
subject.cursor_last_event_id = nil
subject.cursor_last_event_date = nil
subject.cursor_last_event_timestamp = nil
expect(subject.db_replication_lag).to be_nil
expect(subject.db_replication_lag_seconds).to be_nil
expect(subject.repositories_count).to be_zero
expect(subject.repositories_synced_count).to be_zero
expect(subject.repositories_synced_in_percentage).to be_zero
......@@ -284,9 +295,9 @@ describe GeoNodeStatus do
expect(subject.attachments_failed_count).to be_zero
expect(subject.attachments_synced_in_percentage).to be_zero
expect(subject.last_event_id).to be_nil
expect(subject.last_event_date).to be_nil
expect(subject.last_event_timestamp).to be_nil
expect(subject.cursor_last_event_id).to be_nil
expect(subject.cursor_last_event_date).to be_nil
expect(subject.cursor_last_event_timestamp).to be_nil
end
end
end
require 'spec_helper'
describe Geo::MetricsUpdateService, :geo do
  include ::EE::GeoHelpers

  subject { described_class.new }

  let(:timestamp) { Time.now.to_i }

  before do
    allow(Gitlab::Metrics).to receive(:prometheus_metrics_enabled?).and_return(true)
  end

  describe '#execute' do
    before do
      # Geo::NodeStatusService.get is stubbed at class level, so every node
      # queried in these examples answers with this same payload.
      data = {
        health: 'OK',
        db_replication_lag_seconds: 0,
        repositories_count: 10,
        repositories_synced_count: 1,
        repositories_failed_count: 2,
        lfs_objects_count: 100,
        lfs_objects_synced_count: 50,
        lfs_objects_failed_count: 12,
        attachments_count: 30,
        attachments_synced_count: 30,
        attachments_failed_count: 25,
        last_event_id: 2,
        last_event_timestamp: timestamp,
        cursor_last_event_id: 1,
        cursor_last_event_timestamp: timestamp
      }
      request = double(success?: true, parsed_response: data.stringify_keys, code: 200)
      allow(Geo::NodeStatusService).to receive(:get).and_return(request)
    end

    context 'when node is the primary' do
      set(:primary) { create(:geo_node, :primary) }
      set(:secondary) { create(:geo_node) }
      set(:another_secondary) { create(:geo_node) }

      before do
        stub_current_geo_node(primary)
      end

      it 'attempts to retrieve metrics from all nodes' do
        subject.execute

        expect(Gitlab::Metrics.provide_metric(:geo_db_replication_lag_seconds).values.count).to eq(2)
        expect(Gitlab::Metrics.provide_metric(:geo_repositories).values.count).to eq(2)
        # Check each secondary under its own URL label.
        expect(Gitlab::Metrics.provide_metric(:geo_repositories).get({ url: secondary.url })).to eq(10)
        expect(Gitlab::Metrics.provide_metric(:geo_repositories).get({ url: another_secondary.url })).to eq(10)
      end
    end

    context 'when node is a secondary' do
      set(:secondary) { create(:geo_node) }

      before do
        stub_current_geo_node(secondary)
      end

      # Reads the value a metric holds for the secondary's URL label.
      def metric_value(metric_name)
        Gitlab::Metrics.provide_metric(metric_name).get({ url: secondary.url })
      end

      it 'adds gauges for various metrics' do
        subject.execute

        expect(metric_value(:geo_db_replication_lag_seconds)).to eq(0)
        expect(metric_value(:geo_repositories)).to eq(10)
        expect(metric_value(:geo_repositories_synced)).to eq(1)
        expect(metric_value(:geo_repositories_failed)).to eq(2)
        expect(metric_value(:geo_lfs_objects)).to eq(100)
        expect(metric_value(:geo_lfs_objects_synced)).to eq(50)
        expect(metric_value(:geo_lfs_objects_failed)).to eq(12)
        expect(metric_value(:geo_attachments)).to eq(30)
        expect(metric_value(:geo_attachments_synced)).to eq(30)
        expect(metric_value(:geo_attachments_failed)).to eq(25)
        expect(metric_value(:geo_last_event_id)).to eq(2)
        expect(metric_value(:geo_last_event_timestamp)).to eq(timestamp.to_i)
        expect(metric_value(:geo_cursor_last_event_id)).to eq(1)
        expect(metric_value(:geo_cursor_last_event_timestamp)).to eq(timestamp.to_i)
        expect(metric_value(:geo_status_last_updated_timestamp)).to be_truthy
      end

      it 'increments a counter when metrics fail to retrieve' do
        allow(subject).to receive(:node_status).and_return(GeoNodeStatus.new(success: false))

        # Run once to get the gauge set
        subject.execute

        expect { subject.execute }.to change { metric_value(:geo_status_failed_total) }.by(1)
      end
    end
  end
end
......@@ -6,7 +6,7 @@ describe Geo::NodeStatusService do
subject { described_class.new }
describe 'KEYS' do
describe '#status_keys' do
it 'matches the serializer keys' do
exceptions = %w[
id
......@@ -22,7 +22,7 @@ describe Geo::NodeStatusService do
.keys
.map(&:to_s) - exceptions
expect(described_class::KEYS).to match_array(expected)
expect(subject.status_keys).to match_array(expected)
end
end
......@@ -41,7 +41,7 @@ describe Geo::NodeStatusService do
it 'parses a 200 response' do
data = { health: 'OK',
db_replication_lag: 0,
db_replication_lag_seconds: 0,
repositories_count: 10,
repositories_synced_count: 1,
repositories_failed_count: 2,
......@@ -52,15 +52,16 @@ describe Geo::NodeStatusService do
attachments_synced_count: 30,
attachments_failed_count: 25,
last_event_id: 2,
last_event_date: Time.now,
last_event_timestamp: Time.now.to_i,
cursor_last_event_id: 1,
cursor_last_event_date: Time.now }
cursor_last_event_timestamp: Time.now.to_i }
request = double(success?: true, parsed_response: data.stringify_keys, code: 200)
allow(described_class).to receive(:get).and_return(request)
status = subject.call(secondary)
expect(status).to have_attributes(data)
expect(status.success).to be true
end
it 'omits full response text in status' do
......@@ -73,6 +74,7 @@ describe Geo::NodeStatusService do
status = subject.call(secondary)
expect(status.health).to eq("Could not connect to Geo node - HTTP Status Code: 401 Unauthorized\n")
expect(status.success).to be false
end
it 'alerts on bad SSL certficate' do
......
require 'rails_helper'
RSpec.describe Geo::MetricsUpdateWorker, :geo do
  include ::EE::GeoHelpers

  subject { described_class.new }

  describe '#perform' do
    let(:geo_node_key) { create(:geo_node_key) }
    let(:secondary) { create(:geo_node, geo_node_key: geo_node_key) }

    before do
      stub_current_geo_node(secondary)
    end

    # Sets the Prometheus switch that #perform checks before doing any work.
    def stub_prometheus_metrics(enabled)
      allow(Gitlab::Metrics).to receive(:prometheus_metrics_enabled?).and_return(enabled)
    end

    it 'does not execute when Prometheus metrics are disabled' do
      stub_prometheus_metrics(false)

      expect(Geo::MetricsUpdateService).not_to receive(:new)

      subject.perform
    end

    it 'executes when Prometheus metrics are enabled' do
      stub_prometheus_metrics(true)

      expect(Geo::MetricsUpdateService).to receive(:new).and_call_original

      subject.perform
    end
  end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment