Commit ed7546b5 authored by Mark Lapierre, committed by Ramya Authappan

Add e2e test of distributed reads from unhealthy node

parent 932f5927
......@@ -27,6 +27,7 @@ module QA
end
# Checks whether the replicas of a project report identical checksums,
# based on the output of the `gitlab:praefect:replicas` rake task run inside
# the gitlab-gitaly-ha container.
#
# The check is wrapped in Support::Retrier.retry_until with
# raise_on_failure: false, so a falsy block result leads to a retry rather
# than an error being raised from here.
#
# @param project_id id of the project, interpolated into the rake task argument
# @return the result of Support::Retrier.retry_until (presumably truthy once
#   the checksums match — confirm against the retrier's contract)
def replicated?(project_id)
  Support::Retrier.retry_until(raise_on_failure: false) do
    replicas = wait_until_shell_command(%(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"')) do |line|
      QA::Runtime::Logger.debug(line.chomp)
      # The output of the rake task looks something like this:
      # ----------------------------------------------------------------------------------------------------------------------------------------------------------------
      # gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619
      #
      break line if line.start_with?('gitaly_cluster')
      break nil if line.include?('Something went wrong when getting replicas')
    end

    # No replica row was captured (e.g. the task reported a failure): hand a
    # falsy value back to the retrier instead of calling String methods on nil.
    next unless replicas

    # We want to know if the checksums are identical. The first column is the
    # project name; the next three columns are compared.
    replicas.split('|').map(&:strip).drop(1).first(3).uniq.one?
  end
end
def start_primary_node
......@@ -54,6 +57,14 @@ module QA
stop_node(@praefect)
end
# Stops the secondary node by passing @secondary_node to stop_node.
def stop_secondary_node
stop_node(@secondary_node)
end
# Starts the secondary node by passing @secondary_node to start_node.
def start_secondary_node
start_node(@secondary_node)
end
# Starts the docker container with the given name via `docker start`.
#
# @param name container name interpolated into the shell command
def start_node(name)
  start_command = "docker start #{name}"
  shell(start_command)
end
......@@ -120,6 +131,18 @@ module QA
result['data']['result'].map { |result| { node: result['metric']['storage'], value: result['value'][1].to_i } }
end
# Counts replication_queue rows whose state is 'ready' or 'in_progress'.
#
# @return [Integer] the count parsed from the third line of the command
#   output (0 if that line is absent, since nil.to_i is 0)
def replication_queue_incomplete_count
  count_query = "select count(*) from replication_queue where state = 'ready' or state = 'in_progress';"
  output_lines = []

  shell(sql_to_docker_exec_cmd(count_query)) { |line| output_lines.push(line) }

  # The result looks like:
  #     count
  #     -----
  #         1
  # so the number is on the third line (index 2).
  output_lines[2].to_i
end
def replication_queue_lock_count
result = []
shell sql_to_docker_exec_cmd("select count(*) from replication_queue_lock where acquired = 't';") do |line|
......@@ -276,6 +299,22 @@ module QA
end
end
# Waits for a health check failure on the secondary node by passing
# @secondary_node to wait_for_health_check_failure.
def wait_for_secondary_node_health_check_failure
wait_for_health_check_failure(@secondary_node)
end
# Waits until the last line of the Praefect log (read from the @praefect
# container) is a JSON entry recording a failed health check ping for +node+.
#
# @param node storage name compared against the log entry's 'storage' field
def wait_for_health_check_failure(node)
  QA::Runtime::Logger.info("Waiting for Praefect to record a health check failure on #{node}")

  tail_command = "docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'"

  wait_until_shell_command(tail_command) do |line|
    QA::Runtime::Logger.debug(line.chomp)

    entry = JSON.parse(line)
    entry['msg'] == 'error when pinging healthcheck' && entry['storage'] == node
  rescue JSON::ParserError
    # Lines that can't be parsed as JSON are ignored; the block yields nil for them.
    nil
  end
end
def wait_for_gitaly_check
Support::Waiter.repeat_until(max_attempts: 3) do
storage_ok = false
......@@ -292,35 +331,33 @@ module QA
end
end
# Runs the gitlab_shell check rake task inside the @gitlab container and
# waits until its output matches the "Finished" line.
def wait_for_gitlab_shell_check
  check_command = "docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitlab_shell:check'"
  finished_pattern = /Checking GitLab Shell ... Finished/

  wait_until_shell_command_matches(check_command, finished_pattern)
end
# Waits until there is an increase in the number of reads for
# any node compared to the number of reads provided. A node that
# has no pre-read data is meant to be treated as having had zero
# reads; that fallback is delegated to value_for_node.
#
# Baselines are looked up by node name rather than by array position, so a
# node that is missing from (or ordered differently in) pre_read_data does
# not cause a nil dereference here.
#
# @param pre_read_data [Array<Hash>] earlier snapshot whose entries are read
#   through their :node and :value keys
# @return the result of Support::Waiter.wait_until
def wait_for_read_count_change(pre_read_data)
  Support::Waiter.wait_until(sleep_interval: 5) do
    # Re-query on every attempt; one node with a higher count is enough.
    query_read_distribution.any? do |data|
      data[:value] > value_for_node(pre_read_data, data[:node])
    end
  end
end
# Looks up the :value recorded for a node in a list of per-node entries.
#
# @param data [Array<Hash>] entries that are read via their :node and :value keys
# @param node node identifier compared with == against each entry's :node
# @return the matching entry's :value, or 0 when no entry matches
def value_for_node(data, node)
  # The fallback must itself be a hash: Enumerable#find returns the result of
  # calling `ifnone`, and indexing a bare 0 with [:value] raises TypeError
  # instead of producing the intended "zero reads".
  data.find(-> { { value: 0 } }) { |item| item[:node] == node }[:value]
end
# Logs an info message and then calls, in this order, wait_for_praefect,
# wait_for_sql_ping, wait_for_storage_nodes, wait_for_gitlab_shell_check and
# wait_for_gitaly_check. The return value is that of the last call.
def wait_for_reliable_connection
QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
wait_for_praefect
wait_for_sql_ping
wait_for_storage_nodes
wait_for_gitlab_shell_check
wait_for_gitaly_check
end
# Waits until replication of the given project has settled: the replication
# queue has no 'ready' or 'in_progress' entries left and replicated? reports
# success for the project.
#
# A single wait on the combined condition is used; waiting first on
# replicated? alone would be subsumed by this stricter condition and would
# only add a second timeout window.
#
# @param project_id id of the project, passed through to replicated?
# @return the result of Support::Waiter.wait_until
def wait_for_replication(project_id)
  Support::Waiter.wait_until(sleep_interval: 1) do
    replication_queue_incomplete_count.zero? && replicated?(project_id)
  end
end
private
......
......@@ -29,19 +29,46 @@ module QA
pre_read_data = praefect_manager.query_read_distribution
QA::Runtime::Logger.info('Fetching commits from the repository')
Parallel.each((1..number_of_reads)) do |index|
Resource::Repository::Commit.fabricate_via_api! do |commits|
commits.project = project
end
end
Parallel.each((1..number_of_reads)) { project.commits }
praefect_manager.wait_for_read_count_change(pre_read_data)
aggregate_failures "each gitaly node" do
praefect_manager.query_read_distribution.each_with_index do |data, index|
expect(data[:value])
.to be > pre_read_data[index][:value],
"Read counts did not differ for node #{pre_read_data[index][:node]}"
.to be > praefect_manager.value_for_node(pre_read_data, data[:node]),
"Read counts did not differ for node #{data[:node]}"
end
end
end
context 'when a node is unhealthy' do
  # Stop the secondary node and wait for its health check failure to be recorded.
  before do
    praefect_manager.stop_secondary_node
    praefect_manager.wait_for_secondary_node_health_check_failure
  end

  # Leave the cluster in a suitable state for subsequent tests
  after do
    praefect_manager.start_secondary_node
    praefect_manager.wait_for_health_check_all_nodes
    praefect_manager.wait_for_reliable_connection
  end

  it 'does not read from the unhealthy node' do
    reads_before = praefect_manager.query_read_distribution

    QA::Runtime::Logger.info('Fetching commits from the repository')
    Parallel.each((1..number_of_reads)) { project.commits }

    praefect_manager.wait_for_read_count_change(reads_before)
    reads_after = praefect_manager.query_read_distribution

    # Shorthand for looking up one node's read count in a snapshot.
    reads_for = ->(snapshot, node) { praefect_manager.value_for_node(snapshot, node) }

    aggregate_failures "each gitaly node" do
      # NOTE(review): gitaly2 is presumably the stopped secondary node — confirm
      # against the praefect manager's @secondary_node.
      expect(reads_for.call(reads_after, 'gitaly1')).to be > reads_for.call(reads_before, 'gitaly1')
      expect(reads_for.call(reads_after, 'gitaly2')).to eq reads_for.call(reads_before, 'gitaly2')
      expect(reads_for.call(reads_after, 'gitaly3')).to be > reads_for.call(reads_before, 'gitaly3')
    end
  end
end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment