Commit 0d3512cc authored by Nick Thomas

Merge branch '4179-geo-replication-status' into 'master'

Extend Geo Node Status with information in replication slots

See merge request gitlab-org/gitlab-ee!3621
parents df9800da 297dbc02
# Adds the columns that store PostgreSQL replication slot metrics on
# `geo_node_statuses`: the number of slots, the number of slots in use, and
# the maximum amount of WAL (in bytes) retained by any slot.
class AddGeoReplicationSlotStatus < ActiveRecord::Migration
  include Gitlab::Database::MigrationHelpers

  # Set this constant to true if this migration requires downtime.
  # Only nullable columns without defaults are added here.
  DOWNTIME = false

  def change
    add_column :geo_node_statuses, :replication_slots_count, :integer
    add_column :geo_node_statuses, :replication_slots_used_count, :integer
    # Retained WAL is a byte count and can exceed the ~2 GiB ceiling of a
    # 4-byte integer, so store it as an 8-byte integer (bigint).
    # NOTE: db/schema.rb must be regenerated so the dump records `limit: 8`.
    add_column :geo_node_statuses, :replication_slots_max_retained_wal_bytes, :integer, limit: 8
  end
end
...@@ -970,6 +970,9 @@ ActiveRecord::Schema.define(version: 20171213160445) do ...@@ -970,6 +970,9 @@ ActiveRecord::Schema.define(version: 20171213160445) do
t.datetime "updated_at", null: false t.datetime "updated_at", null: false
t.datetime "last_successful_status_check_at" t.datetime "last_successful_status_check_at"
t.string "status_message" t.string "status_message"
t.integer "replication_slots_count"
t.integer "replication_slots_used_count"
t.integer "replication_slots_max_retained_wal_bytes"
t.integer "wikis_count" t.integer "wikis_count"
t.integer "wikis_synced_count" t.integer "wikis_synced_count"
t.integer "wikis_failed_count" t.integer "wikis_failed_count"
......
...@@ -134,6 +134,24 @@ class GeoNode < ActiveRecord::Base ...@@ -134,6 +134,24 @@ class GeoNode < ActiveRecord::Base
namespaces.exists? namespaces.exists?
end end
# Total number of replication slots reported by PgReplicationSlot.
#
# Returns nil unless the database supports replication slots and this node
# is the primary.
def replication_slots_count
  PgReplicationSlot.count if Gitlab::Database.replication_slots_supported? && primary?
end
# Number of replication slots currently in use, as reported by
# PgReplicationSlot.
#
# Returns nil unless the database supports replication slots and this node
# is the primary.
def replication_slots_used_count
  PgReplicationSlot.used_slots_count if Gitlab::Database.replication_slots_supported? && primary?
end
# Largest amount of WAL retained by any replication slot, as reported by
# PgReplicationSlot.max_retained_wal.
#
# Returns nil unless the database supports replication slots and this node
# is the primary.
def replication_slots_max_retained_wal_bytes
  PgReplicationSlot.max_retained_wal if Gitlab::Database.replication_slots_supported? && primary?
end
def find_or_build_status def find_or_build_status
status || build_status status || build_status
end end
......
...@@ -16,6 +16,9 @@ class GeoNodeStatus < ActiveRecord::Base ...@@ -16,6 +16,9 @@ class GeoNodeStatus < ActiveRecord::Base
attachments_count: 'Total number of file attachments available on primary', attachments_count: 'Total number of file attachments available on primary',
attachments_synced_count: 'Number of attachments synced on secondary', attachments_synced_count: 'Number of attachments synced on secondary',
attachments_failed_count: 'Number of attachments failed to sync on secondary', attachments_failed_count: 'Number of attachments failed to sync on secondary',
replication_slots_count: 'Total number of replication slots on the primary',
replication_slots_used_count: 'Number of replication slots in use on the primary',
replication_slots_max_retained_wal_bytes: 'Maximum number of bytes retained in the WAL on the primary',
last_event_id: 'Database ID of the latest event log entry on the primary', last_event_id: 'Database ID of the latest event log entry on the primary',
last_event_timestamp: 'Time of the latest event log entry on the primary', last_event_timestamp: 'Time of the latest event log entry on the primary',
cursor_last_event_id: 'Last database ID of the event log processed by the secondary', cursor_last_event_id: 'Last database ID of the event log processed by the secondary',
...@@ -68,6 +71,12 @@ class GeoNodeStatus < ActiveRecord::Base ...@@ -68,6 +71,12 @@ class GeoNodeStatus < ActiveRecord::Base
self.attachments_count = attachments_finder.count_attachments self.attachments_count = attachments_finder.count_attachments
self.last_successful_status_check_at = Time.now self.last_successful_status_check_at = Time.now
if Gitlab::Geo.primary?
self.replication_slots_count = geo_node.replication_slots_count
self.replication_slots_used_count = geo_node.replication_slots_used_count
self.replication_slots_max_retained_wal_bytes = geo_node.replication_slots_max_retained_wal_bytes
end
if Gitlab::Geo.secondary? if Gitlab::Geo.secondary?
self.db_replication_lag_seconds = Gitlab::Geo::HealthCheck.db_replication_lag_seconds self.db_replication_lag_seconds = Gitlab::Geo::HealthCheck.db_replication_lag_seconds
self.cursor_last_event_id = Geo::EventLogState.last_processed&.event_id self.cursor_last_event_id = Geo::EventLogState.last_processed&.event_id
...@@ -120,19 +129,23 @@ class GeoNodeStatus < ActiveRecord::Base ...@@ -120,19 +129,23 @@ class GeoNodeStatus < ActiveRecord::Base
end end
def repositories_synced_in_percentage def repositories_synced_in_percentage
sync_percentage(repositories_count, repositories_synced_count) calc_percentage(repositories_count, repositories_synced_count)
end end
def wikis_synced_in_percentage def wikis_synced_in_percentage
sync_percentage(wikis_count, wikis_synced_count) calc_percentage(wikis_count, wikis_synced_count)
end end
def lfs_objects_synced_in_percentage def lfs_objects_synced_in_percentage
sync_percentage(lfs_objects_count, lfs_objects_synced_count) calc_percentage(lfs_objects_count, lfs_objects_synced_count)
end end
def attachments_synced_in_percentage def attachments_synced_in_percentage
sync_percentage(attachments_count, attachments_synced_count) calc_percentage(attachments_count, attachments_synced_count)
end
def replication_slots_used_in_percentage
calc_percentage(replication_slots_count, replication_slots_used_count)
end end
def [](key) def [](key)
...@@ -153,9 +166,9 @@ class GeoNodeStatus < ActiveRecord::Base ...@@ -153,9 +166,9 @@ class GeoNodeStatus < ActiveRecord::Base
@projects_finder ||= Geo::ProjectRegistryFinder.new(current_node: geo_node) @projects_finder ||= Geo::ProjectRegistryFinder.new(current_node: geo_node)
end end
def sync_percentage(total, synced) def calc_percentage(total, count)
return 0 if !total.present? || total.zero? return 0 if !total.present? || total.zero?
(synced.to_f / total.to_f) * 100.0 (count.to_f / total.to_f) * 100.0
end end
end end
# Read-only query helpers for PostgreSQL replication slots.
#
# `pg_replication_slots` is a PostgreSQL view, so this is a plain class that
# issues raw SQL through the ActiveRecord connection rather than a model.
class PgReplicationSlot
  class << self
    # Returns the total number of replication slots as an Integer.
    def count
      count_slots
    end

    # Returns the number of replication slots that are not active.
    def unused_slots_count
      count_slots("WHERE active = 'f'")
    end

    # Returns the number of replication slots that are active.
    def used_slots_count
      count_slots("WHERE active = 't'")
    end

    # array of slots and the retained_bytes
    # https://www.skillslogic.com/blog/databases/checking-postgres-replication-lag
    # http://bdr-project.org/docs/stable/monitoring-peers.html
    #
    # Each row carries `slot_name`, `database`, `active` and `retained_bytes`.
    def slots_retained_bytes
      replication_connection.execute(<<-SQL.squish).to_a
        SELECT slot_name, database, active, #{retained_wal_bytes_sql}
          AS retained_bytes
          FROM pg_replication_slots;
      SQL
    end

    # returns the max number WAL space (in bytes) being used across the replication slots
    #
    # The aggregate is given an explicit alias so the lookup does not depend
    # on the column name PostgreSQL picks for an unnamed expression.
    def max_retained_wal
      replication_connection.execute(<<-SQL.squish).first.fetch('max_retained_wal').to_i
        SELECT COALESCE(MAX(#{retained_wal_bytes_sql}), 0) AS max_retained_wal
          FROM pg_replication_slots;
      SQL
    end

    # Returns the configured `max_replication_slots` setting, or 0 when the
    # setting row is missing.
    def max_replication_slots
      replication_connection.execute(<<-SQL.squish).first&.fetch('setting').to_i
        SELECT setting FROM pg_settings WHERE name = 'max_replication_slots';
      SQL
    end

    private

    # Connection used for every query issued by this class.
    def replication_connection
      ActiveRecord::Base.connection
    end

    # Runs a COUNT(*) over pg_replication_slots, optionally filtered by a
    # static WHERE clause supplied by the callers above.
    def count_slots(where_clause = nil)
      sql = ['SELECT COUNT(*) FROM pg_replication_slots', where_clause].compact.join(' ') + ';'

      replication_connection.execute(sql).first.fetch('count').to_i
    end

    # SQL expression for the WAL bytes retained by a slot. PostgreSQL 10
    # renamed the "xlog" functions to "wal"/"lsn", so pick the matching pair.
    def retained_wal_bytes_sql
      if wal_functions_renamed?
        'pg_wal_lsn_diff(pg_current_wal_insert_lsn(), restart_lsn)'
      else
        'pg_xlog_location_diff(pg_current_xlog_insert_location(), restart_lsn)'
      end
    end

    # True when the server reports version 10 or newer
    # (server_version_num >= 100000).
    def wal_functions_renamed?
      version_row = replication_connection.execute('SHOW server_version_num;').first

      version_row.fetch('server_version_num').to_i >= 100_000
    end
  end
end
...@@ -39,6 +39,13 @@ class GeoNodeStatusEntity < Grape::Entity ...@@ -39,6 +39,13 @@ class GeoNodeStatusEntity < Grape::Entity
number_to_percentage(node.wikis_synced_in_percentage, precision: 2) number_to_percentage(node.wikis_synced_in_percentage, precision: 2)
end end
# Replication slot metrics; the two counts are exposed without a formatting block.
expose :replication_slots_count
expose :replication_slots_used_count
# Share of slots in use, formatted as a percentage with two decimal places.
expose :replication_slots_used_in_percentage do |node|
number_to_percentage(node.replication_slots_used_in_percentage, precision: 2)
end
# Exposed without a formatting block, like the counts above.
expose :replication_slots_max_retained_wal_bytes
expose :last_event_id expose :last_event_id
expose :last_event_timestamp expose :last_event_timestamp
expose :cursor_last_event_id expose :cursor_last_event_id
......
...@@ -288,6 +288,44 @@ describe GeoNodeStatus, :geo do ...@@ -288,6 +288,44 @@ describe GeoNodeStatus, :geo do
end end
end end
describe '#replication_slots_used_count' do
  it 'returns the right number of used replication slots' do
    stub_current_geo_node(primary)
    # The value is stubbed on the primary node, which the status reads from.
    allow(primary).to receive_messages(replication_slots_used_count: 1)

    used_count = subject.replication_slots_used_count

    expect(used_count).to eq(1)
  end
end
describe '#replication_slots_used_in_percentage' do
  it 'returns 0 when no replication slots are available' do
    expect(subject.replication_slots_used_in_percentage).to eq(0)
  end

  it 'returns 0 when replication slot count is unknown' do
    # Stub the reader the percentage helper actually calls
    # (`replication_slots_count`, plural) so the nil path is exercised.
    allow(subject).to receive(:replication_slots_count).and_return(nil)

    expect(subject.replication_slots_used_in_percentage).to eq(0)
  end

  it 'returns the right percentage' do
    stub_current_geo_node(primary)
    allow(subject).to receive(:replication_slots_count).and_return(2)
    allow(subject).to receive(:replication_slots_used_count).and_return(1)

    # 1 of 2 slots in use; compared with a tolerance because the result is a float.
    expect(subject.replication_slots_used_in_percentage).to be_within(0.0001).of(50)
  end
end
describe '#replication_slots_max_retained_wal_bytes' do
  it 'returns the number of bytes replication slots are using' do
    stub_current_geo_node(primary)
    # The value is stubbed on the primary node, which the status reads from.
    allow(primary).to receive_messages(replication_slots_max_retained_wal_bytes: 2.megabytes)

    retained_bytes = subject.replication_slots_max_retained_wal_bytes

    expect(retained_bytes).to eq(2.megabytes)
  end
end
describe '#last_event_id and #last_event_date' do describe '#last_event_id and #last_event_date' do
it 'returns nil when no events are available' do it 'returns nil when no events are available' do
expect(subject.last_event_id).to be_nil expect(subject.last_event_id).to be_nil
......
require 'spec_helper'
describe PgReplicationSlot, :postgresql do
  # Examples are only defined when the database supports replication slots.
  if Gitlab::Database.replication_slots_supported?
    describe 'with replication slot support' do
      it '#max_replication_slots' do
        expect(described_class.max_replication_slots).to be >= 0
      end

      # Evaluated while the group is being defined: the nested context creates
      # a slot, so it is skipped when no free slot is left.
      slots_exhausted = PgReplicationSlot.max_replication_slots <= PgReplicationSlot.count
      skip_reason = slots_exhausted ? 'max_replication_slots too small' : nil

      context 'with enough slots available', skip: skip_reason do
        before(:all) do
          # Record the counts before the test slot is created.
          total_rows = ActiveRecord::Base.connection.execute("SELECT COUNT(*) FROM pg_replication_slots;")
          @current_slot_count = total_rows.first.fetch('count').to_i

          unused_rows =
            ActiveRecord::Base.connection.execute("SELECT COUNT(*) FROM pg_replication_slots WHERE active = 'f';")
          @current_unused_count = unused_rows.first.fetch('count').to_i

          ActiveRecord::Base.connection.execute("SELECT * FROM pg_create_physical_replication_slot('test_slot');")
        end

        after(:all) do
          ActiveRecord::Base.connection.execute("SELECT pg_drop_replication_slot('test_slot');")
        end

        it '#slots_count' do
          expected_total = @current_slot_count + 1

          expect(described_class.count).to eq(expected_total)
        end

        it '#unused_slots_count' do
          expected_unused = @current_unused_count + 1

          expect(described_class.unused_slots_count).to eq(expected_unused)
        end

        it '#max_retained_wal' do
          expect(described_class.max_retained_wal).not_to be_nil
        end

        it '#slots_retained_bytes' do
          test_slot = described_class.slots_retained_bytes.find { |row| row['slot_name'] == 'test_slot' }

          expect(test_slot).not_to be_nil
          expect(test_slot['retained_bytes']).to be_nil
        end
      end
    end
  end
end
...@@ -26,6 +26,10 @@ describe GeoNodeStatusEntity, :postgresql do ...@@ -26,6 +26,10 @@ describe GeoNodeStatusEntity, :postgresql do
it { is_expected.to have_key(:wikis_failed_count) } it { is_expected.to have_key(:wikis_failed_count) }
it { is_expected.to have_key(:wikis_synced_count)} it { is_expected.to have_key(:wikis_synced_count)}
it { is_expected.to have_key(:wikis_synced_in_percentage) } it { is_expected.to have_key(:wikis_synced_in_percentage) }
it { is_expected.to have_key(:replication_slots_count) }
it { is_expected.to have_key(:replication_slots_used_count)}
it { is_expected.to have_key(:replication_slots_used_in_percentage) }
it { is_expected.to have_key(:replication_slots_max_retained_wal_bytes) }
it { is_expected.to have_key(:last_successful_status_check_timestamp) } it { is_expected.to have_key(:last_successful_status_check_timestamp) }
it { is_expected.to have_key(:namespaces) } it { is_expected.to have_key(:namespaces) }
...@@ -99,6 +103,15 @@ describe GeoNodeStatusEntity, :postgresql do ...@@ -99,6 +103,15 @@ describe GeoNodeStatusEntity, :postgresql do
end end
end end
describe '#replication_slots_used_in_percentage' do
  it 'formats as percentage' do
    # 2 of 4 slots in use should be rendered with two decimal places.
    slot_counts = { replication_slots_count: 4, replication_slots_used_count: 2 }
    geo_node_status.assign_attributes(slot_counts)

    formatted = subject[:replication_slots_used_in_percentage]

    expect(formatted).to eq '50.00%'
  end
end
describe '#namespaces' do describe '#namespaces' do
it 'returns empty array when full sync is active' do it 'returns empty array when full sync is active' do
expect(subject[:namespaces]).to be_empty expect(subject[:namespaces]).to be_empty
......
...@@ -18,6 +18,10 @@ ...@@ -18,6 +18,10 @@
"wikis_count", "wikis_count",
"wikis_failed_count", "wikis_failed_count",
"wikis_synced_count", "wikis_synced_count",
"replication_slots_count",
"replication_slots_used_count",
"replication_slots_used_in_percentage",
"replication_slots_max_retained_wal_bytes",
"last_event_id", "last_event_id",
"last_event_timestamp", "last_event_timestamp",
"cursor_last_event_id", "cursor_last_event_id",
...@@ -48,6 +52,10 @@ ...@@ -48,6 +52,10 @@
"wikis_failed_count": { "type": "integer" }, "wikis_failed_count": { "type": "integer" },
"wikis_synced_count": { "type": "integer" }, "wikis_synced_count": { "type": "integer" },
"wikis_synced_in_percentage": { "type": "string" }, "wikis_synced_in_percentage": { "type": "string" },
"replication_slots_count": { "type": ["integer", "null"] },
"replication_slots_used_count": { "type": ["integer", "null"] },
"replication_slots_used_in_percentage": { "type": "string" },
"replication_slots_max_retained_wal_bytes": { "type": ["integer", "null"] },
"last_event_id": { "type": ["integer", "null"] }, "last_event_id": { "type": ["integer", "null"] },
"last_event_timestamp": { "type": ["integer", "null"] }, "last_event_timestamp": { "type": ["integer", "null"] },
"cursor_last_event_id": { "type": ["integer", "null"] }, "cursor_last_event_id": { "type": ["integer", "null"] },
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment