Commit b9839a17
Authored Oct 12, 2021 by John McDonnell
Committed by Mark Lapierre, Oct 12, 2021
Updating Gitaly Cluster E2E test logic
Parent: 4fe41b1e
Showing 4 changed files with 91 additions and 71 deletions
qa/qa/service/praefect_manager.rb (+69 −57)
qa/qa/service/shellout.rb (+1 −1)
qa/qa/specs/features/api/3_create/gitaly/automatic_failover_and_recovery_spec.rb (+18 −8)
qa/qa/specs/features/api/3_create/gitaly/backend_node_recovery_spec.rb (+3 −5)
qa/qa/service/praefect_manager.rb
@@ -46,6 +46,10 @@ module QA
       end
     end

+      def stop_primary_node
+        stop_node(@primary_node)
+      end
+
       def start_primary_node
         start_node(@primary_node)
       end
@@ -66,20 +70,29 @@ module QA
         start_node(@secondary_node)
       end

+      def stop_tertiary_node
+        stop_node(@tertiary_node)
+      end
+
+      def start_tertiary_node
+        start_node(@tertiary_node)
+      end
+
       def start_node(name)
         shell "docker start #{name}"
+        wait_until_shell_command_matches(
+          "docker inspect -f {{.State.Running}} #{name}",
+          /true/,
+          sleep_interval: 3,
+          max_duration: 180,
+          retry_on_exception: true
+        )
       end

       def stop_node(name)
         shell "docker stop #{name}"
       end

-      def trigger_failover_by_stopping_primary_node
-        QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
-        stop_node(@primary_node)
-        wait_for_new_primary
-      end
-
       def clear_replication_queue
         QA::Runtime::Logger.info("Clearing the replication queue")
         shell sql_to_docker_exec_cmd(
@@ -157,22 +170,8 @@ module QA
         result[2].to_i
       end

-      # Makes the original primary (gitaly1) the primary again by
-      # stopping the other nodes, waiting for gitaly1 to be made the
-      # primary again, and then it starts the other nodes and enables
-      # writes
-      def reset_primary_to_original
-        QA::Runtime::Logger.info("Checking primary node...")
-
-        return if @primary_node == current_primary_node
-
-        QA::Runtime::Logger.info("Reset primary node to #{@primary_node}")
+      def start_all_nodes
         start_node(@primary_node)
-        stop_node(@secondary_node)
-        stop_node(@tertiary_node)
-
-        wait_for_new_primary_node(@primary_node)
-
         start_node(@secondary_node)
         start_node(@tertiary_node)
@@ -189,10 +188,12 @@ module QA
       end

       def wait_for_praefect
         QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
         wait_until_shell_command_matches(
-          "docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
-          /listening at tcp address/
+          "docker inspect -f {{.State.Running}} #{@praefect}",
+          /true/,
+          sleep_interval: 3,
+          max_duration: 180,
+          retry_on_exception: true
         )

         # Praefect can fail to start if unable to dial one of the gitaly nodes
@@ -204,20 +205,6 @@ module QA
         end
       end

-      def wait_for_new_primary_node(node)
-        QA::Runtime::Logger.info("Wait until #{node} is the primary node")
-        with_praefect_log(max_duration: 120) do |log|
-          break true if log['msg'] == 'primary node changed' && log['newPrimary'] == node
-        end
-      end
-
-      def wait_for_new_primary
-        QA::Runtime::Logger.info("Wait until a new primary node is selected")
-        with_praefect_log(max_duration: 120) do |log|
-          break true if log['msg'] == 'primary node changed'
-        end
-      end
-
       def wait_for_sql_ping
         wait_until_shell_command_matches(
           "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml sql-ping'",
@@ -274,10 +261,6 @@ module QA
         end
       end

-      def wait_for_health_check_current_primary_node
-        wait_for_health_check(current_primary_node)
-      end
-
       def wait_for_health_check_all_nodes
         wait_for_health_check(@primary_node)
         wait_for_health_check(@secondary_node)
@@ -286,29 +269,58 @@ module QA
       def wait_for_health_check(node)
         QA::Runtime::Logger.info("Waiting for health check on #{node}")
-        wait_until_shell_command("docker exec #{node} bash -c 'cat /var/log/gitlab/gitaly/current'") do |line|
-          QA::Runtime::Logger.debug(line.chomp)
-          log = JSON.parse(line)
-
-          log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK'
-        rescue JSON::ParserError
-          # Ignore lines that can't be parsed as JSON
-        end
+        wait_until_node_is_marked_as_healthy_storage(node)
       end

+      def wait_for_primary_node_health_check
+        wait_for_health_check(@primary_node)
+      end
+
+      def wait_for_secondary_node_health_check
+        wait_for_health_check(@secondary_node)
+      end
+
+      def wait_for_tertiary_node_health_check
+        wait_for_health_check(@tertiary_node)
+      end
+
       def wait_for_health_check_failure(node)
-        QA::Runtime::Logger.info("Waiting for Praefect to record a health check failure on #{node}")
-        wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
-          QA::Runtime::Logger.debug(line.chomp)
-          log = JSON.parse(line)
-
-          health_check_failure_message?(log['msg']) && log['storage'] == node
-        rescue JSON::ParserError
-          # Ignore lines that can't be parsed as JSON
-        end
+        QA::Runtime::Logger.info("Waiting for health check failure on #{node}")
+        wait_until_node_is_removed_from_healthy_storages(node)
       end

+      def wait_for_primary_node_health_check_failure
+        wait_for_health_check_failure(@primary_node)
+      end
+
+      def wait_for_secondary_node_health_check_failure
+        wait_for_health_check_failure(@secondary_node)
+      end
+
+      def wait_for_tertiary_node_health_check_failure
+        wait_for_health_check_failure(@tertiary_node)
+      end
+
+      def wait_until_node_is_removed_from_healthy_storages(node)
+        Support::Waiter.wait_until(max_duration: 60, sleep_interval: 3, raise_on_failure: false) do
+          result = []
+          shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
+            result << line
+          end
+          QA::Runtime::Logger.debug("result is ---#{result}")
+          result[2].to_i == 0
+        end
+      end
+
+      def wait_until_node_is_marked_as_healthy_storage(node)
+        Support::Waiter.wait_until(max_duration: 60, sleep_interval: 3, raise_on_failure: false) do
+          result = []
+          shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
+            result << line
+          end
+          QA::Runtime::Logger.debug("result is ---#{result}")
+          result[2].to_i == 1
+        end
+      end
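The two wait_until_node_is_* helpers added above poll Praefect's PostgreSQL healthy_storages table instead of tailing Gitaly and Praefect logs. The result[2].to_i indexing relies on the line-by-line shape of the captured psql output: line 0 is the column header, line 1 the separator, line 2 the count itself. A minimal sketch of that assumption (illustrative values, assuming default psql table formatting, not real output):

    # Illustrative only: the shape of psql output captured line-by-line into `result`.
    result = [" count ", "-------", "     1", "(1 row)"]
    result[2].to_i # => 1, i.e. the node is listed in healthy_storages; 0 means it was removed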
qa/qa/service/shellout.rb
@@ -52,7 +52,7 @@ module QA
     end

     def wait_until_shell_command_matches(cmd, regex, **kwargs)
-      wait_until_shell_command(cmd, kwargs) do |line|
+      wait_until_shell_command(cmd, **kwargs) do |line|
         QA::Runtime::Logger.debug(line.chomp)

         line =~ regex
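The one-character change above is a genuine bug fix on Ruby 3: wait_until_shell_command(cmd, kwargs) passed the options hash as an extra positional argument, whereas **kwargs forwards it as keyword arguments, so options such as sleep_interval: and max_duration: actually reach the wait loop. A minimal sketch of the difference, using a hypothetical wait_until method standing in for GitLab's helper:

    # Hypothetical method illustrating keyword forwarding in Ruby 3 (not GitLab code).
    def wait_until(cmd, sleep_interval: 1, max_duration: 60)
      puts "#{cmd}: sleep_interval=#{sleep_interval}, max_duration=#{max_duration}"
    end

    opts = { sleep_interval: 3, max_duration: 180 }

    wait_until("docker inspect", **opts) # keywords forwarded; sleep_interval=3 is used

    begin
      wait_until("docker inspect", opts) # hash passed positionally
    rescue ArgumentError => e
      puts e.message # Ruby 3: "wrong number of arguments (given 2, expected 1)"
    end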
qa/qa/specs/features/api/3_create/gitaly/automatic_failover_and_recovery_spec.rb
@@ -14,7 +14,7 @@ module QA
       before(:context) do
         # Reset the cluster in case previous tests left it in a bad state
-        praefect_manager.reset_primary_to_original
+        praefect_manager.start_all_nodes

         project = Resource::Project.fabricate! do |project|
           project.name = "gitaly_cluster"
@@ -25,25 +25,35 @@ module QA
       after(:context, quarantine: { issue: 'https://gitlab.com/gitlab-org/gitlab/-/issues/238187', type: :stale }) do
         # Leave the cluster in a suitable state for subsequent tests,
         # if there was a problem during the tests here
-        praefect_manager.reset_primary_to_original
+        praefect_manager.start_all_nodes
       end

       it 'automatically fails over', testcase: 'https://gitlab.com/gitlab-org/quality/testcases/-/quality/test_cases/1267' do
         # Create a new project with a commit and wait for it to replicate
+        # make sure that our project is published to the 'primary' node
+        praefect_manager.stop_secondary_node
+        praefect_manager.stop_tertiary_node
+        praefect_manager.wait_for_secondary_node_health_check_failure
+        praefect_manager.wait_for_tertiary_node_health_check_failure
+
         Resource::Repository::ProjectPush.fabricate! do |push|
           push.project = project
           push.commit_message = first_added_commit_message
           push.new_branch = false
-          push.file_content = "This should exist on both nodes"
+          push.file_content = "This should exist on all nodes"
         end

+        praefect_manager.start_secondary_node
+        praefect_manager.start_tertiary_node
+        praefect_manager.wait_for_health_check_all_nodes
+        praefect_manager.wait_for_replication(project.id)
+
         # Stop the primary node to trigger failover, and then wait
         # for Gitaly to be ready for writes again
-        praefect_manager.trigger_failover_by_stopping_primary_node
-        praefect_manager.wait_for_new_primary
-        praefect_manager.wait_for_health_check_current_primary_node
+        praefect_manager.stop_primary_node
+        praefect_manager.wait_for_primary_node_health_check_failure
         praefect_manager.wait_for_gitaly_check

         Resource::Repository::Commit.fabricate_via_api! do |commit|
@@ -69,7 +79,7 @@ module QA
       it 'automatically reconciles', quarantine: { issue: 'https://gitlab.com/gitlab-org/gitlab/-/issues/238187', type: :stale }, testcase: 'https://gitlab.com/gitlab-org/quality/testcases/-/quality/test_cases/1266' do
         # Start the old primary node again
         praefect_manager.start_primary_node
-        praefect_manager.wait_for_health_check_current_primary_node
+        praefect_manager.wait_for_primary_node_health_check

         # Confirm automatic reconciliation
         expect(praefect_manager.replicated?(project.id)).to be true
@@ -81,7 +91,7 @@ module QA
           .and include(second_added_commit_message)

         # Restore the original primary node
-        praefect_manager.reset_primary_to_original
+        praefect_manager.start_all_nodes

         # Check that all commits are still available even though the primary
         # node was offline when one was made
qa/qa/specs/features/api/3_create/gitaly/backend_node_recovery_spec.rb
@@ -14,12 +14,12 @@ module QA
       before do
         # Reset the cluster in case previous tests left it in a bad state
-        praefect_manager.reset_primary_to_original
+        praefect_manager.start_all_nodes
       end

       after do
         # Leave the cluster in a suitable state for subsequent tests
-        praefect_manager.reset_primary_to_original
+        praefect_manager.start_all_nodes
       end

       it 'recovers from dataloss', testcase: 'https://gitlab.com/gitlab-org/quality/testcases/-/quality/test_cases/1265' do
@@ -28,9 +28,7 @@ module QA
         # Stop the primary node to trigger failover, and then wait
         # for Gitaly to be ready for writes again
-        praefect_manager.trigger_failover_by_stopping_primary_node
-        praefect_manager.wait_for_new_primary
-        praefect_manager.wait_for_health_check_current_primary_node
+        praefect_manager.stop_primary_node
         praefect_manager.wait_for_gitaly_check

         # Confirm that we have access to the repo after failover