Merge branch 'load-balancing-handle-outdated-replicas' into 'master'

Handle outdated replicas in the DB load balancer Closes #2197 See merge request gitlab-org/gitlab-ee!3526

Merge branch 'load-balancing-handle-outdated-replicas' into 'master'
Handle outdated replicas in the DB load balancer Closes #2197 See merge request gitlab-org/gitlab-ee!3526
689b3119 · Douwe Maan · 8c43fa1f · a2867b1b · 689b3119 · 689b3119
Commit 689b3119 authored Nov 30, 2017 by Douwe Maan
8 changed files
--- a/changelogs/unreleased-ee/load-balancing-handle-outdated-replicas.yml
+++ b/changelogs/unreleased-ee/load-balancing-handle-outdated-replicas.yml
+---
+title: Handle outdated replicas in the DB load balancer
+merge_request:
+author:
+type: added
--- a/doc/administration/database_load_balancing.md
+++ b/doc/administration/database_load_balancing.md
@@ -156,6 +156,40 @@ log entries easier. For example:
 [DB-LB] Host 10.123.2.7 came back online
 ```
+## Handling Stale Reads
+> [Introduced][ee-3526] in [GitLab Enterprise Edition Premium][eep] 10.3.
+To prevent reading from an outdated secondary the load balancer will check if it
+is in sync with the primary. If the data is determined to be recent enough the
+secondary can be used, otherwise it will be ignored. To reduce the overhead of
+these checks we only perform these checks at certain intervals.
+There are three configuration options that influence this behaviour:
+| Option                       | Description                                                                                                    | Default    |
+|------------------------------|----------------------------------------------------------------------------------------------------------------|------------|
+| `max_replication_difference` | The amount of data (in bytes) a secondary is allowed to lag behind when it hasn't replicated data for a while. | 8 MB       |
+| `max_replication_lag_time`   | The maximum number of seconds a secondary is allowed to lag behind before we stop using it.                    | 60 seconds |
+| `replica_check_interval`     | The minimum number of seconds we have to wait before checking the status of a secondary.                       | 60 seconds |
+The defaults should be sufficient for most users. Should you want to change them
+you can specify them in `config/database.yml` like so:
+```yaml
+production:
+  username: gitlab
+  database: gitlab
+  encoding: unicode
+  load_balancing:
+    hosts:
+      - host1.example.com
+      - host2.example.com
+    max_replication_difference: 16777216 # 16 MB
+    max_replication_lag_time: 30
+    replica_check_interval: 30
+```
 [hot-standby]: https://www.postgresql.org/docs/9.6/static/hot-standby.html
 [ee-1283]: https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/1283
 [eep]: https://about.gitlab.com/gitlab-ee/
@@ -163,3 +197,4 @@ log entries easier. For example:
 [restart gitlab]: restart_gitlab.md#installations-from-source "How to restart GitLab"
 [wikipedia]: https://en.wikipedia.org/wiki/Load_balancing_(computing)
 [db-req]: ../install/requirements.md#database
+[ee-3526]: https://gitlab.com/gitlab-org/gitlab-ee/merge_requests/3526
--- a/lib/gitlab/database/load_balancing.rb
+++ b/lib/gitlab/database/load_balancing.rb
@@ -24,15 +24,30 @@ module Gitlab
                            [].freeze
                          end
+      # Returns a Hash containing the load balancing configuration.
+      def self.configuration
+        ActiveRecord::Base.configurations[Rails.env]['load_balancing'] || {}
+      end
+      # Returns the maximum replica lag size in bytes.
+      def self.max_replication_difference
+        (configuration['max_replication_difference'] || 8.megabytes).to_i
+      end
+      # Returns the maximum lag time for a replica.
+      def self.max_replication_lag_time
+        (configuration['max_replication_lag_time'] || 60.0).to_f
+      end
+      # Returns the interval (in seconds) to use for checking the status of a
+      # replica.
+      def self.replica_check_interval
+        (configuration['replica_check_interval'] || 60).to_f
+      end
      # Returns the additional hosts to use for load balancing.
      def self.hosts
-        hash = ActiveRecord::Base.configurations[Rails.env]['load_balancing']
+        configuration['hosts'] || []
-        if hash
-          hash['hosts'] || []
-        else
-          []
-        end
      end
      def self.log(level, message)

--- a/lib/gitlab/database/load_balancing/host.rb
+++ b/lib/gitlab/database/load_balancing/host.rb
@@ -3,15 +3,21 @@ module Gitlab
    module LoadBalancing
      # A single database host used for load balancing.
      class Host
-        attr_reader :pool
+        attr_reader :pool, :last_checked_at, :intervals, :load_balancer
        delegate :connection, :release_connection, to: :pool
        # host - The address of the database.
-        def initialize(host)
+        # load_balancer - The LoadBalancer that manages this Host.
+        def initialize(host, load_balancer)
          @host = host
+          @load_balancer = load_balancer
          @pool = Database.create_connection_pool(LoadBalancing.pool_size, host)
          @online = true
+          @last_checked_at = Time.zone.now
+          interval = LoadBalancing.replica_check_interval
+          @intervals = (interval..(interval * 2)).step(0.5).to_a
        end
        def offline!
@@ -23,30 +29,80 @@ module Gitlab
        # Returns true if the host is online.
        def online?
-          return true if @online
+          return @online unless check_replica_status?
-          begin
+          refresh_status
-            retried = 0
-            @online = begin
+          LoadBalancing.log(:info, "Host #{@host} came back online") if @online
-                        connection.active?
-                      rescue
+          @online
-                        if retried < 3
+        end
-                          release_connection
-                          retried += 1
+        def refresh_status
-                          retry
+          @online = replica_is_up_to_date?
-                        else
+          @last_checked_at = Time.zone.now
-                          false
+        end
-                        end
-                      end
+        def check_replica_status?
+          (Time.zone.now - last_checked_at) >= intervals.sample
-            LoadBalancing.log(:info, "Host #{@host} came back online") if @online
+        end
-            @online
+        def replica_is_up_to_date?
-          ensure
+          replication_lag_below_threshold? || data_is_recent_enough?
-            release_connection
+        end
+        def replication_lag_below_threshold?
+          if (lag_time = replication_lag_time)
+            lag_time <= LoadBalancing.max_replication_lag_time
+          else
+            false
+          end
+        end
+        # Returns true if the replica has replicated enough data to be useful.
+        def data_is_recent_enough?
+          # It's possible for a replica to not replay WAL data for a while,
+          # despite being up to date. This can happen when a primary does not
+          # receive any writes a for a while.
+          #
+          # To prevent this from happening we check if the lag size (in bytes)
+          # of the replica is small enough for the replica to be useful. We
+          # only do this if we haven't replicated in a while so we only need
+          # to connect to the primary when truly necessary.
+          if (lag_size = replication_lag_size)
+            lag_size <= LoadBalancing.max_replication_difference
+          else
+            false
          end
        end
+        # Returns the replication lag time of this secondary in seconds as a
+        # float.
+        #
+        # This method will return nil if no lag time could be calculated.
+        def replication_lag_time
+          row = query_and_release('SELECT EXTRACT(EPOCH FROM (now() - pg_last_xact_replay_timestamp()))::float as lag')
+          row['lag'].to_f if row.any?
+        end
+        # Returns the number of bytes this secondary is lagging behind the
+        # primary.
+        #
+        # This method will return nil if no lag size could be calculated.
+        def replication_lag_size
+          location = connection.quote(primary_write_location)
+          row = query_and_release("SELECT pg_xlog_location_diff(#{location}, pg_last_xlog_replay_location())::float AS diff")
+          row['diff'].to_i if row.any?
+        end
+        def primary_write_location
+          load_balancer.primary_write_location
+        ensure
+          load_balancer.release_primary_connection
+        end
        # Returns true if this host has caught up to the given transaction
        # write location.
        #
@@ -60,9 +116,15 @@ module Gitlab
          query = "SELECT NOT pg_is_in_recovery() OR " \
            "pg_xlog_location_diff(pg_last_xlog_replay_location(), #{string}) >= 0 AS result"
-          row = connection.select_all(query).first
+          row = query_and_release(query)
+          row['result'] == 't'
+        end
-          row && row['result'] == 't'
+        def query_and_release(sql)
+          connection.select_all(sql).first || {}
+        rescue
+          {}
        ensure
          release_connection
        end

--- a/lib/gitlab/database/load_balancing/load_balancer.rb
+++ b/lib/gitlab/database/load_balancing/load_balancer.rb
@@ -15,7 +15,7 @@ module Gitlab
        # hosts - The hostnames/addresses of the additional databases.
        def initialize(hosts = [])
-          @host_list = HostList.new(hosts.map { |addr| Host.new(addr) })
+          @host_list = HostList.new(hosts.map { |addr| Host.new(addr, self) })
        end
        # Yields a connection that can be used for reads.

--- a/spec/lib/gitlab/database/load_balancing/host_list_spec.rb
+++ b/spec/lib/gitlab/database/load_balancing/host_list_spec.rb
@@ -2,13 +2,16 @@ require 'spec_helper'
 describe Gitlab::Database::LoadBalancing::HostList do
  before do
-    allow(Gitlab::Database).to receive(:create_connection_pool)
+    allow(Gitlab::Database)
+      .to receive(:create_connection_pool)
      .and_return(ActiveRecord::Base.connection_pool)
  end
+  let(:load_balancer) { double(:load_balancer) }
  let(:host_list) do
    hosts = Array.new(2) do
-      Gitlab::Database::LoadBalancing::Host.new('localhost')
+      Gitlab::Database::LoadBalancing::Host.new('localhost', load_balancer)
    end
    described_class.new(hosts)

--- a/spec/lib/gitlab/database/load_balancing/host_spec.rb
+++ b/spec/lib/gitlab/database/load_balancing/host_spec.rb
 require 'spec_helper'
-describe Gitlab::Database::LoadBalancing::Host do
+describe Gitlab::Database::LoadBalancing::Host, :postgresql do
-  let(:host) { described_class.new('localhost') }
+  let(:load_balancer) do
+    Gitlab::Database::LoadBalancing::LoadBalancer.new(%w[localhost])
+  end
+  let(:host) { load_balancer.host_list.hosts.first }
  before do
    allow(Gitlab::Database).to receive(:create_connection_pool)
@@ -33,68 +37,186 @@ describe Gitlab::Database::LoadBalancing::Host do
  end
  describe '#online?' do
-    let(:error) { Class.new(RuntimeError) }
+    context 'when the replica status is recent enough' do
+      it 'returns the latest status' do
+        Timecop.freeze do
+          host = described_class.new('localhost', load_balancer)
-    before do
+          expect(host).not_to receive(:refresh_status)
-      allow(host.pool).to receive(:disconnect!)
+          expect(host).to be_online
+        end
+      end
    end
-    it 'returns true when the host is online' do
+    context 'when the replica status is outdated' do
-      expect(host).not_to receive(:connection)
+      it 'refreshes the status' do
-      expect(host).not_to receive(:release_connection)
+        host.offline!
-      expect(host.online?).to eq(true)
+        expect(host)
+          .to receive(:check_replica_status?)
+          .and_return(true)
+        expect(host).to be_online
+      end
    end
+  end
-    it 'returns true when the host was marked as offline but is online again' do
+  describe '#refresh_status' do
-      connection = double(:connection, active?: true)
+    it 'refreshes the status' do
+      host.offline!
-      allow(host).to receive(:connection).and_return(connection)
+      expect(host)
+        .to receive(:replica_is_up_to_date?)
+        .and_call_original
-      host.offline!
+      host.refresh_status
-      expect(host).to receive(:release_connection)
+      expect(host).to be_online
-      expect(host.online?).to eq(true)
    end
+  end
-    it 'returns false when the host is offline' do
+  describe '#check_replica_status?' do
-      connection = double(:connection, active?: false)
+    it 'returns true when we need to check the replica status' do
+      allow(host)
+        .to receive(:last_checked_at)
+        .and_return(1.year.ago)
-      allow(host).to receive(:connection).and_return(connection)
+      expect(host.check_replica_status?).to eq(true)
-      expect(host).to receive(:release_connection)
+    end
-      host.offline!
+    it 'returns false when we do not need to check the replica status' do
+      Timecop.freeze do
+        allow(host)
+          .to receive(:last_checked_at)
+          .and_return(Time.zone.now)
-      expect(host.online?).to eq(false)
+        expect(host.check_replica_status?).to eq(false)
+      end
    end
+  end
-    it 'returns false when a connection could not be established' do
+  describe '#replica_is_up_to_date?' do
-      expect(host).to receive(:connection).exactly(4).times.and_raise(error)
+    context 'when the lag time is below the threshold' do
-      expect(host).to receive(:release_connection).exactly(4).times
+      it 'returns true' do
+        expect(host)
+          .to receive(:replication_lag_below_threshold?)
+          .and_return(true)
-      host.offline!
+        expect(host.replica_is_up_to_date?).to eq(true)
+      end
-      expect(host.online?).to eq(false)
    end
-    it 'retries when a connection error is thrown' do
+    context 'when the lag time exceeds the threshold' do
-      connection = double(:connection, active?: true)
+      before do
-      raised = false
+        allow(host)
+          .to receive(:replication_lag_below_threshold?)
+          .and_return(false)
+      end
-      allow(host).to receive(:connection) do
+      it 'returns true if the data is recent enough' do
-        unless raised
+        expect(host)
-          raised = true
+          .to receive(:data_is_recent_enough?)
-          raise error.new
+          .and_return(true)
-        end
-        connection
+        expect(host.replica_is_up_to_date?).to eq(true)
      end
-      expect(host).to receive(:release_connection).twice
+      it 'returns false when the data is not recent enough' do
+        expect(host)
+          .to receive(:data_is_recent_enough?)
+          .and_return(false)
-      host.offline!
+        expect(host.replica_is_up_to_date?).to eq(false)
+      end
+    end
+  end
+  describe '#replication_lag_below_threshold' do
+    it 'returns true when the lag time is below the threshold' do
+      expect(host)
+        .to receive(:replication_lag_time)
+        .and_return(1)
-      expect(host.online?).to eq(true)
+      expect(host.replication_lag_below_threshold?).to eq(true)
+    end
+    it 'returns false when the lag time exceeds the threshold' do
+      expect(host)
+        .to receive(:replication_lag_time)
+        .and_return(9000)
+      expect(host.replication_lag_below_threshold?).to eq(false)
+    end
+    it 'returns false when no lag time could be calculated' do
+      expect(host)
+        .to receive(:replication_lag_time)
+        .and_return(nil)
+      expect(host.replication_lag_below_threshold?).to eq(false)
+    end
+  end
+  describe '#data_is_recent_enough?' do
+    it 'returns true when the data is recent enough' do
+      expect(host.data_is_recent_enough?).to eq(true)
+    end
+    it 'returns false when the data is not recent enough' do
+      diff = Gitlab::Database::LoadBalancing.max_replication_difference * 2
+      expect(host)
+        .to receive(:query_and_release)
+        .and_return({ 'diff' => diff })
+      expect(host.data_is_recent_enough?).to eq(false)
+    end
+    it 'returns false when no lag size could be calculated' do
+      expect(host)
+        .to receive(:replication_lag_size)
+        .and_return(nil)
+      expect(host.data_is_recent_enough?).to eq(false)
+    end
+  end
+  describe '#replication_lag_time' do
+    it 'returns the lag time as a Float' do
+      expect(host.replication_lag_time).to be_an_instance_of(Float)
+    end
+    it 'returns nil when the database query returned no rows' do
+      expect(host)
+        .to receive(:query_and_release)
+        .and_return({})
+      expect(host.replication_lag_time).to be_nil
+    end
+  end
+  describe '#replication_lag_size' do
+    it 'returns the lag size as an Integer' do
+      # On newer versions of Ruby the class is Integer, but on CI we run a
+      # version that still uses Fixnum.
+      classes = [Fixnum, Integer] # rubocop: disable Lint/UnifiedInteger
+      expect(classes).to include(host.replication_lag_size.class)
+    end
+    it 'returns nil when the database query returned no rows' do
+      expect(host)
+        .to receive(:query_and_release)
+        .and_return({})
+      expect(host.replication_lag_size).to be_nil
+    end
+  end
+  describe '#primary_write_location' do
+    it 'returns the write location of the primary' do
+      expect(host.primary_write_location).to be_an_instance_of(String)
+      expect(host.primary_write_location).not_to be_empty
    end
  end
@@ -119,4 +241,29 @@ describe Gitlab::Database::LoadBalancing::Host do
      expect(host.caught_up?('foo')).to eq(false)
    end
  end
+  describe '#query_and_release' do
+    it 'executes a SQL query' do
+      results = host.query_and_release('SELECT 10 AS number')
+      expect(results).to be_an_instance_of(Hash)
+      expect(results['number'].to_i).to eq(10)
+    end
+    it 'releases the connection after running the query' do
+      expect(host)
+        .to receive(:release_connection)
+        .once
+      host.query_and_release('SELECT 10 AS number')
+    end
+    it 'returns an empty Hash in the event of an error' do
+      expect(host.connection)
+        .to receive(:select_all)
+        .and_raise(RuntimeError, 'kittens')
+      expect(host.query_and_release('SELECT 10 AS number')).to eq({})
+    end
+  end
 end
--- a/spec/lib/gitlab/database/load_balancing_spec.rb
+++ b/spec/lib/gitlab/database/load_balancing_spec.rb
@@ -9,10 +9,89 @@ describe Gitlab::Database::LoadBalancing do
    end
  end
+  describe '.configuration' do
+    it 'returns a Hash' do
+      config = { 'hosts' => %w(foo) }
+      allow(ActiveRecord::Base.configurations[Rails.env])
+        .to receive(:[])
+        .with('load_balancing')
+        .and_return(config)
+      expect(described_class.configuration).to eq(config)
+    end
+  end
+  describe '.max_replication_difference' do
+    context 'without an explicitly configured value' do
+      it 'returns the default value' do
+        allow(described_class)
+          .to receive(:configuration)
+          .and_return({})
+        expect(described_class.max_replication_difference).to eq(8.megabytes)
+      end
+    end
+    context 'with an explicitly configured value' do
+      it 'returns the configured value' do
+        allow(described_class)
+          .to receive(:configuration)
+          .and_return({ 'max_replication_difference' => 4 })
+        expect(described_class.max_replication_difference).to eq(4)
+      end
+    end
+  end
+  describe '.max_replication_lag_time' do
+    context 'without an explicitly configured value' do
+      it 'returns the default value' do
+        allow(described_class)
+          .to receive(:configuration)
+          .and_return({})
+        expect(described_class.max_replication_lag_time).to eq(60)
+      end
+    end
+    context 'with an explicitly configured value' do
+      it 'returns the configured value' do
+        allow(described_class)
+          .to receive(:configuration)
+          .and_return({ 'max_replication_lag_time' => 4 })
+        expect(described_class.max_replication_lag_time).to eq(4)
+      end
+    end
+  end
+  describe '.replica_check_interval' do
+    context 'without an explicitly configured value' do
+      it 'returns the default value' do
+        allow(described_class)
+          .to receive(:configuration)
+          .and_return({})
+        expect(described_class.replica_check_interval).to eq(60)
+      end
+    end
+    context 'with an explicitly configured value' do
+      it 'returns the configured value' do
+        allow(described_class)
+          .to receive(:configuration)
+          .and_return({ 'replica_check_interval' => 4 })
+        expect(described_class.replica_check_interval).to eq(4)
+      end
+    end
+  end
  describe '.hosts' do
    it 'returns a list of hosts' do
-      allow(ActiveRecord::Base.configurations[Rails.env]).to receive(:[])
+      allow(described_class)
-        .with('load_balancing')
+        .to receive(:configuration)
        .and_return({ 'hosts' => %w(foo bar baz) })
      expect(described_class.hosts).to eq(%w(foo bar baz))