Commit 8927e76c authored by Stan Hu

Merge branch '5195-geo-add-prometheus-metrics' into 'master'

Add Prometheus metrics to track Geo autocorrect numbers

See merge request gitlab-org/gitlab-ee!6778
parents b8da28c1 e23f5f87
......@@ -11,7 +11,7 @@
#
# It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20180726172057) do
ActiveRecord::Schema.define(version: 20180803001726) do
# These are extensions that must be enabled in order to support this database
enable_extension "plpgsql"
......@@ -1159,6 +1159,8 @@ ActiveRecord::Schema.define(version: 20180726172057) do
t.integer "wikis_checksum_failed_count"
t.integer "wikis_checksum_mismatch_count"
t.binary "storage_configuration_digest"
t.integer "repositories_retrying_verification_count"
t.integer "wikis_retrying_verification_count"
end
add_index "geo_node_statuses", ["geo_node_id"], name: "index_geo_node_statuses_on_geo_node_id", unique: true, using: :btree
......
......@@ -57,39 +57,41 @@ The following metrics are available:
Sidekiq jobs may also gather metrics, and these metrics can be accessed if the Sidekiq exporter is enabled (e.g. via
the `monitoring.sidekiq_exporter` configuration option in `gitlab.yml`).
| Metric | Type | Since | Description | Labels |
|:------------------------------------------- |:------- |:----- |:----------- |:------ |
| geo_db_replication_lag_seconds | Gauge | 10.2 | Database replication lag (seconds) | url
| geo_repositories | Gauge | 10.2 | Total number of repositories available on primary | url
| geo_repositories_synced | Gauge | 10.2 | Number of repositories synced on secondary | url
| geo_repositories_failed | Gauge | 10.2 | Number of repositories failed to sync on secondary | url
| geo_lfs_objects | Gauge | 10.2 | Total number of LFS objects available on primary | url
| geo_lfs_objects_synced | Gauge | 10.2 | Number of LFS objects synced on secondary | url
| geo_lfs_objects_failed | Gauge | 10.2 | Number of LFS objects failed to sync on secondary | url
| geo_attachments | Gauge | 10.2 | Total number of file attachments available on primary | url
| geo_attachments_synced | Gauge | 10.2 | Number of attachments synced on secondary | url
| geo_attachments_failed | Gauge | 10.2 | Number of attachments failed to sync on secondary | url
| geo_last_event_id | Gauge | 10.2 | Database ID of the latest event log entry on the primary | url
| geo_last_event_timestamp | Gauge | 10.2 | UNIX timestamp of the latest event log entry on the primary | url
| geo_cursor_last_event_id | Gauge | 10.2 | Last database ID of the event log processed by the secondary | url
| geo_cursor_last_event_timestamp | Gauge | 10.2 | Last UNIX timestamp of the event log processed by the secondary | url
| geo_status_failed_total | Counter | 10.2 | Number of times retrieving the status from the Geo Node failed | url
| geo_last_successful_status_check_timestamp | Gauge | 10.2 | Last timestamp when the status was successfully updated | url
| geo_lfs_objects_synced_missing_on_primary | Gauge | 10.7 | Number of LFS objects marked as synced due to the file missing on the primary | url
| geo_job_artifacts_synced_missing_on_primary | Gauge | 10.7 | Number of job artifacts marked as synced due to the file missing on the primary | url
| geo_attachments_synced_missing_on_primary | Gauge | 10.7 | Number of attachments marked as synced due to the file missing on the primary | url
| geo_repositories_checksummed_count | Gauge | 10.7 | Number of repositories checksummed on primary | url
| geo_repositories_checksum_failed_count | Gauge | 10.7 | Number of repositories failed to calculate the checksum on primary | url
| geo_wikis_checksummed_count | Gauge | 10.7 | Number of wikis checksummed on primary | url
| geo_wikis_checksum_failed_count | Gauge | 10.7 | Number of wikis failed to calculate the checksum on primary | url
| geo_repositories_verified_count | Gauge | 10.7 | Number of repositories verified on secondary | url
| geo_repositories_verification_failed_count | Gauge | 10.7 | Number of repositories failed to verify on secondary | url
| geo_repositories_checksum_mismatch_count | Gauge | 10.7 | Number of repositories that checksum mismatch on secondary | url
| geo_wikis_verified_count | Gauge | 10.7 | Number of wikis verified on secondary | url
| geo_wikis_verification_failed_count | Gauge | 10.7 | Number of wikis failed to verify on secondary | url
| geo_wikis_checksum_mismatch_count | Gauge | 10.7 | Number of wikis that checksum mismatch on secondary | url
| geo_repositories_checked_count | Gauge | 11.1 | Number of repositories that have been checked via `git fsck` | url
| geo_repositories_checked_failed_count | Gauge | 11.1 | Number of repositories that have a failure from `git fsck` | url
| Metric | Type | Since | Description | Labels |
|:-------------------------------------------- |:------- |:----- |:----------- |:------ |
| geo_db_replication_lag_seconds | Gauge | 10.2 | Database replication lag (seconds) | url
| geo_repositories | Gauge | 10.2 | Total number of repositories available on primary | url
| geo_repositories_synced | Gauge | 10.2 | Number of repositories synced on secondary | url
| geo_repositories_failed | Gauge | 10.2 | Number of repositories failed to sync on secondary | url
| geo_lfs_objects | Gauge | 10.2 | Total number of LFS objects available on primary | url
| geo_lfs_objects_synced | Gauge | 10.2 | Number of LFS objects synced on secondary | url
| geo_lfs_objects_failed | Gauge | 10.2 | Number of LFS objects failed to sync on secondary | url
| geo_attachments | Gauge | 10.2 | Total number of file attachments available on primary | url
| geo_attachments_synced | Gauge | 10.2 | Number of attachments synced on secondary | url
| geo_attachments_failed | Gauge | 10.2 | Number of attachments failed to sync on secondary | url
| geo_last_event_id | Gauge | 10.2 | Database ID of the latest event log entry on the primary | url
| geo_last_event_timestamp | Gauge | 10.2 | UNIX timestamp of the latest event log entry on the primary | url
| geo_cursor_last_event_id | Gauge | 10.2 | Last database ID of the event log processed by the secondary | url
| geo_cursor_last_event_timestamp | Gauge | 10.2 | Last UNIX timestamp of the event log processed by the secondary | url
| geo_status_failed_total | Counter | 10.2 | Number of times retrieving the status from the Geo Node failed | url
| geo_last_successful_status_check_timestamp | Gauge | 10.2 | Last timestamp when the status was successfully updated | url
| geo_lfs_objects_synced_missing_on_primary | Gauge | 10.7 | Number of LFS objects marked as synced due to the file missing on the primary | url
| geo_job_artifacts_synced_missing_on_primary | Gauge | 10.7 | Number of job artifacts marked as synced due to the file missing on the primary | url
| geo_attachments_synced_missing_on_primary | Gauge | 10.7 | Number of attachments marked as synced due to the file missing on the primary | url
| geo_repositories_checksummed_count | Gauge | 10.7 | Number of repositories checksummed on primary | url
| geo_repositories_checksum_failed_count | Gauge | 10.7 | Number of repositories failed to calculate the checksum on primary | url
| geo_wikis_checksummed_count | Gauge | 10.7 | Number of wikis checksummed on primary | url
| geo_wikis_checksum_failed_count | Gauge | 10.7 | Number of wikis failed to calculate the checksum on primary | url
| geo_repositories_verified_count | Gauge | 10.7 | Number of repositories verified on secondary | url
| geo_repositories_verification_failed_count | Gauge | 10.7 | Number of repositories failed to verify on secondary | url
| geo_repositories_checksum_mismatch_count | Gauge | 10.7 | Number of repositories that checksum mismatch on secondary | url
| geo_wikis_verified_count | Gauge | 10.7 | Number of wikis verified on secondary | url
| geo_wikis_verification_failed_count | Gauge | 10.7 | Number of wikis failed to verify on secondary | url
| geo_wikis_checksum_mismatch_count | Gauge | 10.7 | Number of wikis that checksum mismatch on secondary | url
| geo_repositories_checked_count | Gauge | 11.1 | Number of repositories that have been checked via `git fsck` | url
| geo_repositories_checked_failed_count | Gauge | 11.1 | Number of repositories that have a failure from `git fsck` | url
| geo_repositories_retrying_verification_count | Gauge | 11.2 | Number of repositories verification failures that Geo is actively trying to correct on secondary | url
| geo_wikis_retrying_verification_count | Gauge | 11.2 | Number of wikis verification failures that Geo is actively trying to correct on secondary | url
### Ruby metrics
......
......@@ -206,6 +206,8 @@ Example response:
"wikis_verification_failed_count": 3,
"wikis_verified_in_percentage": "24.39%",
"wikis_checksum_mismatch_count": 1,
"repositories_retrying_verification_count": 1,
"wikis_retrying_verification_count": 3,
"repositories_checked_count": 7,
"repositories_checked_failed_count": 2,
"repositories_checked_in_percentage": "17.07%",
......@@ -265,6 +267,8 @@ Example response:
"wikis_verification_failed_count": 3,
"wikis_verified_in_percentage": "24.39%",
"wikis_checksum_mismatch_count": 1,
"repositories_retrying_verification_count": 4,
"wikis_retrying_verification_count": 2,
"repositories_checked_count": 5,
"repositories_checked_failed_count": 1,
"repositories_checked_in_percentage": "12.20%",
......
......@@ -88,6 +88,14 @@ module Geo
Geo::ProjectRegistry.wiki_checksum_mismatch.count
end
# Counts the project registries returned by
# Geo::ProjectRegistry.repositories_retrying_verification.
def count_repositories_retrying_verification
  retrying_registries = Geo::ProjectRegistry.repositories_retrying_verification
  retrying_registries.count
end
# Counts the project registries returned by
# Geo::ProjectRegistry.wikis_retrying_verification.
def count_wikis_retrying_verification
  retrying_registries = Geo::ProjectRegistry.wikis_retrying_verification
  retrying_registries.count
end
def count_verification_failed_repositories
find_verification_failed_project_registries('repository').count
end
......
......@@ -53,6 +53,20 @@ class Geo::ProjectRegistry < Geo::BaseRegistry
where(repository_checksum_mismatch.or(wiki_checksum_mismatch))
end
# Scope: registries whose repository verification retry count is positive
# and whose resync_repository flag is true.
def self.repositories_retrying_verification
  has_retried = arel_table[:repository_verification_retry_count].gt(0)
  flagged_for_resync = arel_table[:resync_repository].eq(true)

  where(has_retried.and(flagged_for_resync))
end
# Scope: registries whose wiki verification retry count is positive
# and whose resync_wiki flag is true.
def self.wikis_retrying_verification
  has_retried = arel_table[:wiki_verification_retry_count].gt(0)
  flagged_for_resync = arel_table[:resync_wiki].eq(true)

  where(has_retried.and(flagged_for_resync))
end
def self.retry_due
where(
arel_table[:repository_retry_at].lt(Time.now)
......
......@@ -77,7 +77,9 @@ class GeoNodeStatus < ActiveRecord::Base
hashed_storage_migrated_max_id: 'Highest ID present in projects migrated to hashed storage',
hashed_storage_attachments_max_id: 'Highest ID present in attachments migrated to hashed storage',
repositories_checked_count: 'Number of repositories checked',
repositories_checked_failed_count: 'Number of failed repositories checked'
repositories_checked_failed_count: 'Number of failed repositories checked',
repositories_retrying_verification_count: 'Number of repositories verification failures that Geo is actively trying to correct on secondary',
wikis_retrying_verification_count: 'Number of wikis verification failures that Geo is actively trying to correct on secondary'
}.freeze
EXPIRATION_IN_MINUTES = 5
......@@ -238,6 +240,8 @@ class GeoNodeStatus < ActiveRecord::Base
self.wikis_verified_count = projects_finder.count_verified_wikis
self.wikis_verification_failed_count = projects_finder.count_verification_failed_wikis
self.wikis_checksum_mismatch_count = projects_finder.count_wikis_checksum_mismatch
self.repositories_retrying_verification_count = projects_finder.count_repositories_retrying_verification
self.wikis_retrying_verification_count = projects_finder.count_wikis_retrying_verification
end
end
......
---
title: Add Prometheus metrics to track Geo autocorrect numbers
merge_request: 6778
author:
type: added
# frozen_string_literal: true
# Adds two integer columns to geo_node_statuses that hold the repository and
# wiki "retrying verification" counts.
class AddVerificationRetryCountsToGeoNodeStatuses < ActiveRecord::Migration
  # NOTE(review): presumably marks the migration as not requiring downtime
  # (project convention) — confirm.
  DOWNTIME = false

  def change
    %i[repositories_retrying_verification_count wikis_retrying_verification_count].each do |column_name|
      add_column :geo_node_statuses, column_name, :integer
    end
  end
end
......@@ -331,6 +331,9 @@ module EE
end
expose :wikis_checksum_mismatch_count
expose :repositories_retrying_verification_count
expose :wikis_retrying_verification_count
expose :replication_slots_count
expose :replication_slots_used_count
expose :replication_slots_used_in_percentage do |node|
......
......@@ -33,6 +33,8 @@ FactoryBot.define do
wikis_verified_count 499
wikis_verification_failed_count 99
wikis_checksum_mismatch_count 10
repositories_retrying_verification_count 25
wikis_retrying_verification_count 3
last_event_id 2
last_event_timestamp { Time.now.to_i }
cursor_last_event_id 1
......
......@@ -40,6 +40,8 @@
"wikis_verification_failed_count",
"wikis_verified_in_percentage",
"wikis_checksum_mismatch_count",
"repositories_retrying_verification_count",
"wikis_retrying_verification_count",
"repositories_checked_count",
"repositories_checked_failed_count",
"repositories_checked_in_percentage",
......@@ -103,6 +105,8 @@
"wikis_verification_failed_count": { "type": ["integer", "null"] },
"wikis_verified_in_percentage": { "type": "string" },
"wikis_checksum_mismatch_count": { "type": ["integer", "null"] },
"repositories_retrying_verification_count": { "type": ["integer", "null"] },
"wikis_retrying_verification_count": { "type": ["integer", "null"] },
"repositories_checked_count": { "type": ["integer", "null"] },
"repositories_checked_failed_count": { "type": ["integer", "null"] },
"repositories_checked_in_percentage": { "type": "string" },
......
......@@ -712,6 +712,27 @@ describe GeoNodeStatus, :geo do
end
end
# Specs for GeoNodeStatus#repositories_retrying_verification_count with the
# current Geo node stubbed to the secondary.
describe '#repositories_retrying_verification_count' do
  before do
    stub_current_geo_node(secondary)
  end

  it 'returns the right number of repositories retrying verification' do
    # Of the three registries created here, only the first has a positive
    # repository verification retry count, so exactly one is expected.
    create(:geo_project_registry, :repository_verification_failed, repository_verification_retry_count: 1)
    create(:geo_project_registry, :repository_verification_failed, repository_verification_retry_count: nil)
    create(:geo_project_registry, :repository_verified)

    expect(subject.repositories_retrying_verification_count).to eq(1)
  end

  it 'returns existing value when feature flag is off' do
    allow(Gitlab::Geo).to receive(:repository_verification_enabled?).and_return(false)
    # NOTE(review): 25 presumably comes from the geo_node_status record
    # created here — confirm against the factory.
    create(:geo_node_status, :healthy, geo_node: secondary)

    expect(subject.repositories_retrying_verification_count).to eq(25)
  end
end
describe '#wikis_verified_count' do
before do
stub_current_geo_node(secondary)
......@@ -773,6 +794,27 @@ describe GeoNodeStatus, :geo do
end
end
# Specs for GeoNodeStatus#wikis_retrying_verification_count with the current
# Geo node stubbed to the secondary.
describe '#wikis_retrying_verification_count' do
  before do
    stub_current_geo_node(secondary)
  end

  it 'returns the right number of wikis retrying verification' do
    # Of the three registries created here, only the first has a positive
    # wiki verification retry count, so exactly one is expected.
    create(:geo_project_registry, :wiki_verification_failed, wiki_verification_retry_count: 1)
    create(:geo_project_registry, :wiki_verification_failed, wiki_verification_retry_count: nil)
    create(:geo_project_registry, :wiki_verified)

    expect(subject.wikis_retrying_verification_count).to eq(1)
  end

  it 'returns existing value when feature flag is off' do
    allow(Gitlab::Geo).to receive(:repository_verification_enabled?).and_return(false)
    # NOTE(review): 3 presumably comes from the geo_node_status record
    # created here — confirm against the factory.
    create(:geo_node_status, :healthy, geo_node: secondary)

    expect(subject.wikis_retrying_verification_count).to eq(3)
  end
end
describe '#last_event_id and #last_event_date' do
it 'returns nil when no events are available' do
expect(subject.last_event_id).to be_nil
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment