Commit 7b8e5b7d authored by Mark Lapierre's avatar Mark Lapierre

Add E2E test of backend node recovery

Includes changes to allow Resources to return enumerable results via
the API
parent 3d9857f7
...@@ -123,6 +123,10 @@ module QA ...@@ -123,6 +123,10 @@ module QA
"#{api_get_path}/runners" "#{api_get_path}/runners"
end end
def api_commits_path
"#{api_get_path}/repository/commits"
end
def api_repository_branches_path def api_repository_branches_path
"#{api_get_path}/repository/branches" "#{api_get_path}/repository/branches"
end end
...@@ -176,6 +180,10 @@ module QA ...@@ -176,6 +180,10 @@ module QA
raise Runtime::API::RepositoryStorageMoves::RepositoryStorageMovesError, 'Timed out while waiting for the repository storage move to finish' raise Runtime::API::RepositoryStorageMoves::RepositoryStorageMovesError, 'Timed out while waiting for the repository storage move to finish'
end end
def commits
parse_body(get(Runtime::API::Request.new(api_client, api_commits_path).url))
end
def import_status def import_status
response = get Runtime::API::Request.new(api_client, "/projects/#{id}/import").url response = get Runtime::API::Request.new(api_client, "/projects/#{id}/import").url
......
...@@ -17,21 +17,31 @@ module QA ...@@ -17,21 +17,31 @@ module QA
@virtual_storage = 'default' @virtual_storage = 'default'
end end
def enable_writes # Executes the praefect `dataloss` command.
shell "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml enable-writes -virtual-storage #{@virtual_storage}'" #
# @return [Boolean] whether dataloss has occurred
def dataloss?
wait_until_shell_command_matches(dataloss_command, /Outdated repositories/)
end end
def replicated?(project_id) def replicated?(project_id)
shell %(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"') do |line| replicas = wait_until_shell_command(%(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"')) do |line|
QA::Runtime::Logger.debug(line.chomp)
# The output of the rake task looks something like this: # The output of the rake task looks something like this:
# #
# Project name | gitaly1 (primary) | gitaly2 | gitaly3 # Project name | gitaly1 (primary) | gitaly2 | gitaly3
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------- # ----------------------------------------------------------------------------------------------------------------------------------------------------------------
# gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 # gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619
# #
# We want to confirm that the checksums are identical break line if line.start_with?("gitaly_cluster")
break line.split('|').map(&:strip)[1..3].uniq.one? if line.start_with?("gitaly_cluster") end
# We want to know if the checksums are identical
replicas.split('|').map(&:strip)[1..3].uniq.one?
end end
def start_primary_node
start_node(@primary_node)
end end
def start_praefect def start_praefect
...@@ -51,40 +61,54 @@ module QA ...@@ -51,40 +61,54 @@ module QA
end end
def trigger_failover_by_stopping_primary_node def trigger_failover_by_stopping_primary_node
QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
stop_node(@primary_node) stop_node(@primary_node)
end end
def clear_replication_queue def clear_replication_queue
QA::Runtime::Logger.debug("Clearing the replication queue") QA::Runtime::Logger.info("Clearing the replication queue")
shell <<~CMD shell sql_to_docker_exec_cmd(
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \ <<~SQL
bash -c "psql -U postgres -d praefect_production -h postgres.test \ delete from replication_queue_job_lock;
-c \\"delete from replication_queue_job_lock; delete from replication_queue_lock; delete from replication_queue;\\"" delete from replication_queue_lock;
CMD delete from replication_queue;
SQL
)
end end
def create_stalled_replication_queue def create_stalled_replication_queue
QA::Runtime::Logger.debug("Setting jobs in replication queue to `in_progress` and acquiring locks") QA::Runtime::Logger.info("Setting jobs in replication queue to `in_progress` and acquiring locks")
shell <<~CMD shell sql_to_docker_exec_cmd(
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \ <<~SQL
bash -c "psql -U postgres -d praefect_production -h postgres.test \ update replication_queue set state = 'in_progress';
-c \\"update replication_queue set state = 'in_progress';
insert into replication_queue_job_lock (job_id, lock_id, triggered_at) insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
select id, rq.lock_id, created_at from replication_queue rq select id, rq.lock_id, created_at from replication_queue rq
left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
where state = 'in_progress' and rqjl.job_id is null; where state = 'in_progress' and rqjl.job_id is null;
update replication_queue_lock set acquired = 't';\\"" update replication_queue_lock set acquired = 't';
CMD SQL
)
end
# Reconciles the previous primary node with the current one
# I.e., it brings the previous primary node up-to-date
def reconcile_nodes
reconcile_node_with_node(@primary_node, current_primary_node)
end
def reconcile_node_with_node(target, reference)
QA::Runtime::Logger.info("Reconcile #{target} with #{reference} on #{@virtual_storage}")
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml reconcile -virtual #{@virtual_storage} -target #{target} -reference #{reference} -f'",
/FINISHED: \d+ repos were checked for consistency/,
sleep_interval: 5,
retry_on_exception: true
)
end end
def replication_queue_lock_count def replication_queue_lock_count
result = [] result = []
cmd = <<~CMD shell sql_to_docker_exec_cmd("select count(*) from replication_queue_lock where acquired = 't';") do |line|
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"select count(*) from replication_queue_lock where acquired = 't';\\""
CMD
shell cmd do |line|
result << line result << line
end end
# The result looks like: # The result looks like:
...@@ -94,12 +118,36 @@ module QA ...@@ -94,12 +118,36 @@ module QA
result[2].to_i result[2].to_i
end end
# Makes the original primary (gitaly1) the primary again by
# stopping the other nodes, waiting for gitaly1 to be made the
# primary again, and then it starts the other nodes and enables
# writes
def reset_primary_to_original
QA::Runtime::Logger.info("Checking primary node...")
return if @primary_node == current_primary_node
QA::Runtime::Logger.info("Reset primary node to #{@primary_node}")
start_node(@primary_node)
stop_node(@secondary_node)
stop_node(@tertiary_node)
wait_for_new_primary_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
wait_for_health_check_all_nodes
wait_for_reliable_connection
end
def reset_cluster def reset_cluster
QA::Runtime::Logger.info('Reset Gitaly Cluster by starting all nodes and enabling writes')
start_node(@praefect) start_node(@praefect)
start_node(@primary_node) start_node(@primary_node)
start_node(@secondary_node) start_node(@secondary_node)
start_node(@tertiary_node) start_node(@tertiary_node)
enable_writes wait_for_health_check_all_nodes
end end
def verify_storage_move(source_storage, destination_storage) def verify_storage_move(source_storage, destination_storage)
...@@ -111,10 +159,33 @@ module QA ...@@ -111,10 +159,33 @@ module QA
end end
def wait_for_praefect def wait_for_praefect
QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
wait_until_shell_command_matches( wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'", "docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
/listening at tcp address/ /listening at tcp address/
) )
# Praefect can fail to start if unable to dial one of the gitaly nodes
# See https://gitlab.com/gitlab-org/gitaly/-/issues/2847
# We tail the logs to allow us to confirm if that is the problem if tests fail
shell "docker exec #{@praefect} bash -c 'tail /var/log/gitlab/praefect/current'" do |line|
QA::Runtime::Logger.debug(line.chomp)
end
end
def wait_for_new_primary_node(node)
QA::Runtime::Logger.info("Wait until #{node} is the primary node")
with_praefect_log do |log|
break true if log['msg'] == 'primary node changed' && log['newPrimary'] == node
end
end
def wait_for_new_primary
QA::Runtime::Logger.info("Wait until a new primary node is selected")
with_praefect_log do |log|
break true if log['msg'] == 'primary node changed'
end
end end
def wait_for_sql_ping def wait_for_sql_ping
...@@ -124,7 +195,33 @@ module QA ...@@ -124,7 +195,33 @@ module QA
) )
end end
def wait_for_no_praefect_storage_error
# If a healthcheck error was the last message to be logged, we'll keep seeing that message even if it's no longer a problem
# That is, there's no message shown in the Praefect logs when the healthcheck succeeds
# To work around that we perform the gitaly check rake task, wait a few seconds, and then we confirm that no healthcheck errors appear
QA::Runtime::Logger.info("Checking that Praefect does not report healthcheck errors with its gitaly nodes")
Support::Waiter.wait_until(max_duration: 120) do
wait_for_gitaly_check
sleep 5
shell "docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'" do |line|
QA::Runtime::Logger.debug(line.chomp)
log = JSON.parse(line)
break true if log['msg'] != 'error when pinging healthcheck'
rescue JSON::ParserError
# Ignore lines that can't be parsed as JSON
end
end
end
def wait_for_storage_nodes def wait_for_storage_nodes
wait_for_no_praefect_storage_error
Support::Waiter.repeat_until(max_attempts: 3) do
nodes_confirmed = { nodes_confirmed = {
@primary_node => false, @primary_node => false,
@secondary_node => false, @secondary_node => false,
...@@ -132,7 +229,7 @@ module QA ...@@ -132,7 +229,7 @@ module QA
} }
wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line| wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
QA::Runtime::Logger.info(line.chomp) QA::Runtime::Logger.debug(line.chomp)
nodes_confirmed.each_key do |node| nodes_confirmed.each_key do |node|
nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/ nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
...@@ -141,13 +238,37 @@ module QA ...@@ -141,13 +238,37 @@ module QA
nodes_confirmed.values.all? nodes_confirmed.values.all?
end end
end end
end
def wait_for_health_check_current_primary_node
wait_for_health_check(current_primary_node)
end
def wait_for_health_check_all_nodes
wait_for_health_check(@primary_node)
wait_for_health_check(@secondary_node)
wait_for_health_check(@tertiary_node)
end
def wait_for_health_check(node)
QA::Runtime::Logger.info("Waiting for health check on #{node}")
wait_until_shell_command("docker exec #{node} bash -c 'cat /var/log/gitlab/gitaly/current'") do |line|
QA::Runtime::Logger.debug(line.chomp)
log = JSON.parse(line)
log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK'
rescue JSON::ParserError
# Ignore lines that can't be parsed as JSON
end
end
def wait_for_gitaly_check def wait_for_gitaly_check
Support::Waiter.repeat_until(max_attempts: 3) do
storage_ok = false storage_ok = false
check_finished = false check_finished = false
wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line| wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
QA::Runtime::Logger.info(line.chomp) QA::Runtime::Logger.debug(line.chomp)
storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/ storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
check_finished = true if line =~ /Checking Gitaly ... Finished/ check_finished = true if line =~ /Checking Gitaly ... Finished/
...@@ -155,6 +276,7 @@ module QA ...@@ -155,6 +276,7 @@ module QA
storage_ok && check_finished storage_ok && check_finished
end end
end end
end
def wait_for_gitlab_shell_check def wait_for_gitlab_shell_check
wait_until_shell_command_matches( wait_until_shell_command_matches(
...@@ -164,15 +286,36 @@ module QA ...@@ -164,15 +286,36 @@ module QA
end end
def wait_for_reliable_connection def wait_for_reliable_connection
QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
wait_for_praefect wait_for_praefect
wait_for_sql_ping wait_for_sql_ping
wait_for_storage_nodes wait_for_storage_nodes
wait_for_gitaly_check
wait_for_gitlab_shell_check wait_for_gitlab_shell_check
end end
def wait_for_replication(project_id)
Support::Waiter.wait_until(sleep_interval: 1) { replicated?(project_id) }
end
private private
def current_primary_node
shell dataloss_command do |line|
QA::Runtime::Logger.debug(line.chomp)
match = line.match(/Primary: (.*)/)
break match[1] if match
end
end
def dataloss_command
"docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dataloss'"
end
def sql_to_docker_exec_cmd(sql)
Service::Shellout.sql_to_docker_exec_cmd(sql, 'postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', @postgres)
end
def verify_storage_move_from_gitaly(storage) def verify_storage_move_from_gitaly(storage)
wait_until_shell_command("docker exec #{@gitlab} bash -c 'tail -n 50 /var/log/gitlab/gitaly/current'") do |line| wait_until_shell_command("docker exec #{@gitlab} bash -c 'tail -n 50 /var/log/gitlab/gitaly/current'") do |line|
log = JSON.parse(line) log = JSON.parse(line)
...@@ -203,21 +346,10 @@ module QA ...@@ -203,21 +346,10 @@ module QA
end end
end end
def wait_until_shell_command(cmd, **kwargs) def with_praefect_log
sleep_interval = kwargs.delete(:sleep_interval) || 1 wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
QA::Runtime::Logger.debug(line.chomp)
Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do yield JSON.parse(line)
shell cmd do |line|
break true if yield line
end
end
end
def wait_until_shell_command_matches(cmd, regex, **kwargs)
wait_until_shell_command(cmd, kwargs) do |line|
QA::Runtime::Logger.info(line.chomp)
line =~ regex
end end
end end
end end
......
...@@ -33,6 +33,31 @@ module QA ...@@ -33,6 +33,31 @@ module QA
end end
end end
end end
def sql_to_docker_exec_cmd(sql, username, password, database, host, container)
<<~CMD
docker exec --env PGPASSWORD=#{password} #{container} \
bash -c "psql -U #{username} -d #{database} -h #{host} -c \\"#{sql}\\""
CMD
end
def wait_until_shell_command(cmd, **kwargs)
sleep_interval = kwargs.delete(:sleep_interval) || 1
Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
shell cmd do |line|
break true if yield line
end
end
end
def wait_until_shell_command_matches(cmd, regex, **kwargs)
wait_until_shell_command(cmd, kwargs) do |line|
QA::Runtime::Logger.debug(line.chomp)
line =~ regex
end
end
end end
end end
end end
# frozen_string_literal: true
module QA
RSpec.describe 'Create' do
context 'Gitaly' do
describe 'Backend node recovery', :orchestrated, :gitaly_ha, :skip_live_env do
let(:praefect_manager) { Service::PraefectManager.new }
let(:project) do
Resource::Project.fabricate! do |project|
project.name = "gitaly_cluster"
project.initialize_with_readme = true
end
end
before do
# Reset the cluster in case previous tests left it in a bad state
praefect_manager.reset_primary_to_original
end
after do
# Leave the cluster in a suitable state for subsequent tests
praefect_manager.reset_primary_to_original
end
it 'recovers from dataloss' do
# Create a new project with a commit and wait for it to replicate
praefect_manager.wait_for_replication(project.id)
# Stop the primary node to trigger failover, and then wait
# for Gitaly to be ready for writes again
praefect_manager.trigger_failover_by_stopping_primary_node
praefect_manager.wait_for_new_primary
praefect_manager.wait_for_health_check_current_primary_node
praefect_manager.wait_for_gitaly_check
# Confirm that we have access to the repo after failover
Support::Waiter.wait_until(retry_on_exception: true, sleep_interval: 5) do
Resource::Repository::Commit.fabricate_via_api! do |commits|
commits.project = project
commits.sha = 'master'
end
end
# Push a commit to the new primary
Resource::Repository::ProjectPush.fabricate! do |push|
push.project = project
push.new_branch = false
push.commit_message = 'pushed after failover'
push.file_name = 'new_file'
push.file_content = 'new file'
end
# Start the old primary node again
praefect_manager.start_primary_node
praefect_manager.wait_for_health_check_current_primary_node
# Confirm dataloss (i.e., inconsistent nodes)
expect(praefect_manager.dataloss?).to be true
expect(praefect_manager.replicated?(project.id)).to be false
# Reconcile nodes to recover from dataloss
praefect_manager.reconcile_nodes
praefect_manager.wait_for_replication(project.id)
# Confirm that both commits are available after reconciliation
expect(project.commits.map { |commit| commit[:message].chomp })
.to include("Initial commit").and include("pushed after failover")
end
end
end
end
end
...@@ -41,8 +41,6 @@ module QA ...@@ -41,8 +41,6 @@ module QA
expect(show).to have_file(initial_file) expect(show).to have_file(initial_file)
end end
praefect_manager.enable_writes
Resource::Repository::Commit.fabricate_via_api! do |commit| Resource::Repository::Commit.fabricate_via_api! do |commit|
commit.project = project commit.project = project
commit.add_files([ commit.add_files([
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment