Commit 28e47ff8 authored by Mark Lapierre

Merge branch 'jmd-fixing-gitaly-cluster-e2e-tests' into 'master'

Updating Gitaly Cluster E2E test logic

See merge request gitlab-org/gitlab!71789
parents f21f76b8 b9839a17
@@ -46,6 +46,10 @@ module QA
end
end
def stop_primary_node
stop_node(@primary_node)
end
def start_primary_node
start_node(@primary_node)
end
@@ -66,20 +70,29 @@ module QA
start_node(@secondary_node)
end
def stop_tertiary_node
stop_node(@tertiary_node)
end
def start_tertiary_node
start_node(@tertiary_node)
end
def start_node(name)
shell "docker start #{name}"
wait_until_shell_command_matches(
"docker inspect -f {{.State.Running}} #{name}",
/true/,
sleep_interval: 3,
max_duration: 180,
retry_on_exception: true
)
end
def stop_node(name)
shell "docker stop #{name}"
end
def trigger_failover_by_stopping_primary_node
QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
stop_node(@primary_node)
wait_for_new_primary
end
def clear_replication_queue
QA::Runtime::Logger.info("Clearing the replication queue")
shell sql_to_docker_exec_cmd(
@@ -157,22 +170,8 @@ module QA
result[2].to_i
end
-# Makes the original primary (gitaly1) the primary again by
-# stopping the other nodes, waiting for gitaly1 to be made the
-# primary again, and then it starts the other nodes and enables
-# writes
-def reset_primary_to_original
-QA::Runtime::Logger.info("Checking primary node...")
-return if @primary_node == current_primary_node
-QA::Runtime::Logger.info("Reset primary node to #{@primary_node}")
+def start_all_nodes
start_node(@primary_node)
-stop_node(@secondary_node)
-stop_node(@tertiary_node)
-wait_for_new_primary_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
@@ -189,10 +188,12 @@ module QA
end
def wait_for_praefect
QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
/listening at tcp address/
"docker inspect -f {{.State.Running}} #{@praefect}",
/true/,
sleep_interval: 3,
max_duration: 180,
retry_on_exception: true
)
# Praefect can fail to start if unable to dial one of the gitaly nodes
@@ -204,20 +205,6 @@ module QA
end
end
def wait_for_new_primary_node(node)
QA::Runtime::Logger.info("Wait until #{node} is the primary node")
with_praefect_log(max_duration: 120) do |log|
break true if log['msg'] == 'primary node changed' && log['newPrimary'] == node
end
end
def wait_for_new_primary
QA::Runtime::Logger.info("Wait until a new primary node is selected")
with_praefect_log(max_duration: 120) do |log|
break true if log['msg'] == 'primary node changed'
end
end
def wait_for_sql_ping
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml sql-ping'",
@@ -274,10 +261,6 @@ module QA
end
end
def wait_for_health_check_current_primary_node
wait_for_health_check(current_primary_node)
end
def wait_for_health_check_all_nodes
wait_for_health_check(@primary_node)
wait_for_health_check(@secondary_node)
@@ -286,29 +269,58 @@ module QA
def wait_for_health_check(node)
QA::Runtime::Logger.info("Waiting for health check on #{node}")
-wait_until_shell_command("docker exec #{node} bash -c 'cat /var/log/gitlab/gitaly/current'") do |line|
-QA::Runtime::Logger.debug(line.chomp)
-log = JSON.parse(line)
-log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK'
-rescue JSON::ParserError
-# Ignore lines that can't be parsed as JSON
-end
+wait_until_node_is_marked_as_healthy_storage(node)
end
+def wait_for_primary_node_health_check
+wait_for_health_check(@primary_node)
+end
+def wait_for_secondary_node_health_check
+wait_for_health_check(@secondary_node)
+end
+def wait_for_tertiary_node_health_check
+wait_for_health_check(@tertiary_node)
+end
-def wait_for_health_check_failure(node)
-QA::Runtime::Logger.info("Waiting for Praefect to record a health check failure on #{node}")
-wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
-QA::Runtime::Logger.debug(line.chomp)
-log = JSON.parse(line)
-health_check_failure_message?(log['msg']) && log['storage'] == node
-rescue JSON::ParserError
-# Ignore lines that can't be parsed as JSON
-end
-end
+def wait_for_health_check_failure(node)
+QA::Runtime::Logger.info("Waiting for health check failure on #{node}")
+wait_until_node_is_removed_from_healthy_storages(node)
+end
+def wait_for_primary_node_health_check_failure
+wait_for_health_check_failure(@primary_node)
+end
+def wait_for_secondary_node_health_check_failure
+wait_for_health_check_failure(@secondary_node)
+end
+def wait_for_tertiary_node_health_check_failure
+wait_for_health_check_failure(@tertiary_node)
+end
+def wait_until_node_is_removed_from_healthy_storages(node)
+Support::Waiter.wait_until(max_duration: 60, sleep_interval: 3, raise_on_failure: false) do
+result = []
+shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
+result << line
+end
+QA::Runtime::Logger.debug("result is ---#{result}")
+result[2].to_i == 0
+end
+end
+def wait_until_node_is_marked_as_healthy_storage(node)
+Support::Waiter.wait_until(max_duration: 60, sleep_interval: 3, raise_on_failure: false) do
+result = []
+shell sql_to_docker_exec_cmd("SELECT count(*) FROM healthy_storages WHERE storage = '#{node}';") do |line|
+result << line
+end
+QA::Runtime::Logger.debug("result is ---#{result}")
+result[2].to_i == 1
+end
+end
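The result[2].to_i checks above (like the earlier count query) appear to assume that sql_to_docker_exec_cmd runs the query through psql's default aligned output, where line 0 is the column header, line 1 the separator and line 2 the value. A minimal standalone Ruby sketch of that parsing, using hard-coded sample output instead of the real helper:

# Sample output in psql's default aligned format (assumed, not captured from a real run).
raw_output = <<~PSQL
  count
  -------
  1
  (1 row)
PSQL

result = raw_output.lines.map(&:chomp)
# Index 2 holds the value itself, mirroring result[2].to_i in the methods above.
node_is_healthy = result[2].to_i == 1
puts node_is_healthy # => true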
......
@@ -52,7 +52,7 @@ module QA
end
def wait_until_shell_command_matches(cmd, regex, **kwargs)
-wait_until_shell_command(cmd, kwargs) do |line|
+wait_until_shell_command(cmd, **kwargs) do |line|
QA::Runtime::Logger.debug(line.chomp)
line =~ regex
......
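The change above from wait_until_shell_command(cmd, kwargs) to wait_until_shell_command(cmd, **kwargs) matters under Ruby 3's keyword-argument separation: a hash passed positionally is no longer converted to keyword arguments. A minimal sketch of the difference, using a hypothetical helper signature rather than the real QA methods:

# Hypothetical stand-in for a helper that accepts keyword options.
def run_with_retries(cmd, max_duration: 180, sleep_interval: 1)
  puts "#{cmd} (max_duration=#{max_duration}, sleep_interval=#{sleep_interval})"
end

options = { max_duration: 60, sleep_interval: 3 }

# run_with_retries('true', options)  # ArgumentError on Ruby 3: the hash arrives as a
#                                    # second positional argument, not as keywords.
run_with_retries('true', **options)  # Splatting forwards the options as keyword arguments.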
@@ -14,7 +14,7 @@ module QA
before(:context) do
# Reset the cluster in case previous tests left it in a bad state
-praefect_manager.reset_primary_to_original
+praefect_manager.start_all_nodes
project = Resource::Project.fabricate! do |project|
project.name = "gitaly_cluster"
@@ -25,25 +25,35 @@ module QA
after(:context, quarantine: { issue: 'https://gitlab.com/gitlab-org/gitlab/-/issues/238187', type: :stale }) do
# Leave the cluster in a suitable state for subsequent tests,
# if there was a problem during the tests here
-praefect_manager.reset_primary_to_original
+praefect_manager.start_all_nodes
end
it 'automatically fails over', testcase: 'https://gitlab.com/gitlab-org/quality/testcases/-/quality/test_cases/1267' do
-# Create a new project with a commit and wait for it to replicate
+# make sure that our project is published to the 'primary' node
+praefect_manager.stop_secondary_node
+praefect_manager.stop_tertiary_node
+praefect_manager.wait_for_secondary_node_health_check_failure
+praefect_manager.wait_for_tertiary_node_health_check_failure
Resource::Repository::ProjectPush.fabricate! do |push|
push.project = project
push.commit_message = first_added_commit_message
push.new_branch = false
push.file_content = "This should exist on both nodes"
push.file_content = "This should exist on all nodes"
end
+praefect_manager.start_secondary_node
+praefect_manager.start_tertiary_node
+praefect_manager.wait_for_health_check_all_nodes
praefect_manager.wait_for_replication(project.id)
# Stop the primary node to trigger failover, and then wait
# for Gitaly to be ready for writes again
-praefect_manager.trigger_failover_by_stopping_primary_node
-praefect_manager.wait_for_new_primary
-praefect_manager.wait_for_health_check_current_primary_node
+praefect_manager.stop_primary_node
+praefect_manager.wait_for_primary_node_health_check_failure
praefect_manager.wait_for_gitaly_check
Resource::Repository::Commit.fabricate_via_api! do |commit|
@@ -69,7 +79,7 @@ module QA
it 'automatically reconciles', quarantine: { issue: 'https://gitlab.com/gitlab-org/gitlab/-/issues/238187', type: :stale }, testcase: 'https://gitlab.com/gitlab-org/quality/testcases/-/quality/test_cases/1266' do
# Start the old primary node again
praefect_manager.start_primary_node
-praefect_manager.wait_for_health_check_current_primary_node
+praefect_manager.wait_for_primary_node_health_check
# Confirm automatic reconciliation
expect(praefect_manager.replicated?(project.id)).to be true
@@ -81,7 +91,7 @@ module QA
.and include(second_added_commit_message)
# Restore the original primary node
-praefect_manager.reset_primary_to_original
+praefect_manager.start_all_nodes
# Check that all commits are still available even though the primary
# node was offline when one was made
......
@@ -14,12 +14,12 @@ module QA
before do
# Reset the cluster in case previous tests left it in a bad state
-praefect_manager.reset_primary_to_original
+praefect_manager.start_all_nodes
end
after do
# Leave the cluster in a suitable state for subsequent tests
-praefect_manager.reset_primary_to_original
+praefect_manager.start_all_nodes
end
it 'recovers from dataloss', testcase: 'https://gitlab.com/gitlab-org/quality/testcases/-/quality/test_cases/1265' do
@@ -28,9 +28,7 @@ module QA
# Stop the primary node to trigger failover, and then wait
# for Gitaly to be ready for writes again
-praefect_manager.trigger_failover_by_stopping_primary_node
-praefect_manager.wait_for_new_primary
-praefect_manager.wait_for_health_check_current_primary_node
+praefect_manager.stop_primary_node
praefect_manager.wait_for_gitaly_check
# Confirm that we have access to the repo after failover
......