Commit 55c3ea37 authored by Imre Farkas

Merge branch 'topology_collect_node_cpu_memory_utilization' into 'master'

Collect node CPU and memory utilization in usage ping

See merge request gitlab-org/gitlab!38681
parents 40b10e2b 3098e1be
---
title: Collect node CPU and memory utilization in usage ping
merge_request: 38681
author:
type: other
......@@ -593,7 +593,9 @@ The following is example content of the Usage Ping payload.
"nodes": [
{
"node_memory_total_bytes": 33269903360,
"node_memory_utilization": 0.35,
"node_cpus": 16,
"node_cpu_utilization": 0.2,
"node_uname_info": {
"machine": "x86_64",
"sysname": "Linux",
......
......@@ -77,12 +77,12 @@ module Gitlab
# metric labels to their respective values.
#
# @return [Hash] mapping labels to their aggregate numeric values, or the empty hash if no results were found
def aggregate(aggregate_query, time: Time.now)
def aggregate(aggregate_query, time: Time.now, transform_value: :to_f)
response = query(aggregate_query, time: time)
response.to_h do |result|
key = block_given? ? yield(result['metric']) : result['metric']
_timestamp, value = result['value']
[key, value.to_i]
[key, value.public_send(transform_value)] # rubocop:disable GitlabSecurity/PublicSend
end
end
......
......@@ -63,7 +63,9 @@ module Gitlab
def topology_node_data(client)
# node-level data
by_instance_mem = topology_node_memory(client)
by_instance_mem_utilization = topology_node_memory_utilization(client)
by_instance_cpus = topology_node_cpus(client)
by_instance_cpu_utilization = topology_node_cpu_utilization(client)
by_instance_uname_info = topology_node_uname_info(client)
# service-level data
by_instance_by_job_by_type_memory = topology_all_service_memory(client)
......@@ -73,7 +75,9 @@ module Gitlab
@instances.map do |instance|
{
node_memory_total_bytes: by_instance_mem[instance],
node_memory_utilization: by_instance_mem_utilization[instance],
node_cpus: by_instance_cpus[instance],
node_cpu_utilization: by_instance_cpu_utilization[instance],
node_uname_info: by_instance_uname_info[instance],
node_services:
topology_node_services(
......@@ -89,12 +93,24 @@ module Gitlab
end
end
# Collects per-node memory utilization from the
# 'gitlab_usage_ping:node_memory_utilization:avg' series.
#
# The series is wrapped by aggregate_one_week (no explicit aggregation is
# passed, so that helper's default applies) and the sample values are
# converted with :to_f instead of aggregate_by_instance's :to_i default.
# The lookup runs through query_safely under the 'node_memory_utilization'
# label with an empty hash as fallback.
#
# @param client the Prometheus client forwarded to aggregate_by_instance
# @return the result of query_safely; callers index it by instance
def topology_node_memory_utilization(client)
  query_safely('gitlab_usage_ping:node_memory_utilization:avg', 'node_memory_utilization', fallback: {}) do |series|
    weekly_series = aggregate_one_week(series)
    aggregate_by_instance(client, weekly_series, transform_value: :to_f)
  end
end
# Collects the per-node CPU count from the
# 'gitlab_usage_ping:node_cpus:count' series.
#
# The series is wrapped by aggregate_one_week with the :max aggregation and
# handed to aggregate_by_instance without a transform_value, so that helper's
# default conversion applies. The lookup runs through query_safely under the
# 'node_cpus' label with an empty hash as fallback.
#
# @param client the Prometheus client forwarded to aggregate_by_instance
# @return the result of query_safely; callers index it by instance
def topology_node_cpus(client)
  query_safely('gitlab_usage_ping:node_cpus:count', 'node_cpus', fallback: {}) do |series|
    weekly_max = aggregate_one_week(series, aggregation: :max)
    aggregate_by_instance(client, weekly_max)
  end
end
# Collects per-node CPU utilization from the
# 'gitlab_usage_ping:node_cpu_utilization:avg' series.
#
# The series is wrapped by aggregate_one_week (no explicit aggregation is
# passed, so that helper's default applies) and the sample values are
# converted with :to_f instead of aggregate_by_instance's :to_i default.
# The lookup runs through query_safely under the 'node_cpu_utilization'
# label with an empty hash as fallback.
#
# @param client the Prometheus client forwarded to aggregate_by_instance
# @return the result of query_safely; callers index it by instance
def topology_node_cpu_utilization(client)
  query_safely('gitlab_usage_ping:node_cpu_utilization:avg', 'node_cpu_utilization', fallback: {}) do |series|
    weekly_series = aggregate_one_week(series)
    aggregate_by_instance(client, weekly_series, transform_value: :to_f)
  end
end
def topology_node_uname_info(client)
node_uname_info = query_safely('node_uname_info', 'node_uname_info', fallback: []) do |query|
client.query(query)
......@@ -235,13 +251,13 @@ module Gitlab
"#{aggregation}_over_time (#{query}[1w])"
end
def aggregate_by_instance(client, query)
client.aggregate(query) { |metric| normalize_and_track_instance(metric['instance']) }
def aggregate_by_instance(client, query, transform_value: :to_i)
client.aggregate(query, transform_value: transform_value) { |metric| normalize_and_track_instance(metric['instance']) }
end
# Will retain a composite key that values are mapped to
def aggregate_by_labels(client, query)
client.aggregate(query) do |metric|
def aggregate_by_labels(client, query, transform_value: :to_i)
client.aggregate(query, transform_value: transform_value) do |metric|
metric['instance'] = normalize_and_track_instance(metric['instance'])
metric
end
......
......@@ -24,7 +24,9 @@ RSpec.describe Gitlab::UsageData::Topology do
expect_prometheus_api_to(
receive_app_request_volume_query,
receive_node_memory_query,
receive_node_memory_utilization_query,
receive_node_cpu_count_query,
receive_node_cpu_utilization_query,
receive_node_uname_info_query,
receive_node_service_memory_rss_query,
receive_node_service_memory_uss_query,
......@@ -40,7 +42,9 @@ RSpec.describe Gitlab::UsageData::Topology do
nodes: [
{
node_memory_total_bytes: 512,
node_memory_utilization: 0.45,
node_cpus: 8,
node_cpu_utilization: 0.1,
node_uname_info: {
machine: 'x86_64',
sysname: 'Linux',
......@@ -64,7 +68,9 @@ RSpec.describe Gitlab::UsageData::Topology do
},
{
node_memory_total_bytes: 1024,
node_memory_utilization: 0.25,
node_cpus: 16,
node_cpu_utilization: 0.2,
node_uname_info: {
machine: 'x86_64',
sysname: 'Linux',
......@@ -102,7 +108,9 @@ RSpec.describe Gitlab::UsageData::Topology do
expect_prometheus_api_to(
receive_app_request_volume_query(result: []),
receive_node_memory_query(result: []),
receive_node_memory_utilization_query(result: []),
receive_node_cpu_count_query,
receive_node_cpu_utilization_query,
receive_node_uname_info_query,
receive_node_service_memory_rss_query(result: []),
receive_node_service_memory_uss_query(result: []),
......@@ -116,6 +124,7 @@ RSpec.describe Gitlab::UsageData::Topology do
failures: [
{ 'app_requests' => 'empty_result' },
{ 'node_memory' => 'empty_result' },
{ 'node_memory_utilization' => 'empty_result' },
{ 'service_rss' => 'empty_result' },
{ 'service_uss' => 'empty_result' },
{ 'service_workers' => 'empty_result' }
......@@ -123,6 +132,7 @@ RSpec.describe Gitlab::UsageData::Topology do
nodes: [
{
node_cpus: 16,
node_cpu_utilization: 0.2,
node_uname_info: {
machine: 'x86_64',
release: '4.15.0-101-generic',
......@@ -146,6 +156,7 @@ RSpec.describe Gitlab::UsageData::Topology do
},
{
node_cpus: 8,
node_cpu_utilization: 0.1,
node_uname_info: {
machine: 'x86_64',
release: '4.19.76-linuxkit',
......@@ -178,6 +189,15 @@ RSpec.describe Gitlab::UsageData::Topology do
]
end
# Prometheus-style result holding a single memory-utilization sample for
# instance 'localhost:9100'. The value is the pair [1000, '0.35'], with the
# reading kept as a string.
let(:node_memory_utilization_response) do
  sample = { 'metric' => { 'instance' => 'localhost:9100' }, 'value' => [1000, '0.35'] }
  [sample]
end
let(:node_uname_info_response) do
[
{
......@@ -226,7 +246,9 @@ RSpec.describe Gitlab::UsageData::Topology do
expect_prometheus_api_to(
receive_app_request_volume_query(result: []),
receive_node_memory_query(result: node_memory_response),
receive_node_memory_utilization_query(result: node_memory_utilization_response),
receive_node_cpu_count_query(result: []),
receive_node_cpu_utilization_query(result: []),
receive_node_uname_info_query(result: node_uname_info_response),
receive_node_service_memory_rss_query(result: service_memory_response),
receive_node_service_memory_uss_query(result: []),
......@@ -240,6 +262,7 @@ RSpec.describe Gitlab::UsageData::Topology do
failures: [
{ 'app_requests' => 'empty_result' },
{ 'node_cpus' => 'empty_result' },
{ 'node_cpu_utilization' => 'empty_result' },
{ 'service_uss' => 'empty_result' },
{ 'service_pss' => 'empty_result' },
{ 'service_process_count' => 'empty_result' },
......@@ -248,6 +271,7 @@ RSpec.describe Gitlab::UsageData::Topology do
nodes: [
{
node_memory_total_bytes: 512,
node_memory_utilization: 0.35,
node_uname_info: {
machine: 'x86_64',
sysname: 'Linux',
......@@ -286,7 +310,9 @@ RSpec.describe Gitlab::UsageData::Topology do
expect_prometheus_api_to(
receive_app_request_volume_query(result: []),
receive_node_memory_query(result: []),
receive_node_memory_utilization_query(result: []),
receive_node_cpu_count_query(result: []),
receive_node_cpu_utilization_query(result: []),
receive_node_uname_info_query(result: []),
receive_node_service_memory_rss_query,
receive_node_service_memory_uss_query(result: []),
......@@ -300,7 +326,9 @@ RSpec.describe Gitlab::UsageData::Topology do
failures: [
{ 'app_requests' => 'empty_result' },
{ 'node_memory' => 'empty_result' },
{ 'node_memory_utilization' => 'empty_result' },
{ 'node_cpus' => 'empty_result' },
{ 'node_cpu_utilization' => 'empty_result' },
{ 'node_uname_info' => 'empty_result' },
{ 'service_uss' => 'empty_result' },
{ 'service_pss' => 'empty_result' },
......@@ -355,7 +383,9 @@ RSpec.describe Gitlab::UsageData::Topology do
expect_prometheus_api_to(
receive_app_request_volume_query(result: []),
receive_node_memory_query(result: []),
receive_node_memory_utilization_query(result: []),
receive_node_cpu_count_query(result: []),
receive_node_cpu_utilization_query(result: []),
receive_node_uname_info_query(result: []),
receive_node_service_memory_rss_query(result: []),
receive_node_service_memory_uss_query(result: []),
......@@ -382,7 +412,9 @@ RSpec.describe Gitlab::UsageData::Topology do
failures: [
{ 'app_requests' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_memory_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_cpus' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_cpu_utilization' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'node_uname_info' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_rss' => 'Gitlab::PrometheusClient::ConnectionError' },
{ 'service_uss' => 'Gitlab::PrometheusClient::ConnectionError' },
......@@ -447,6 +479,21 @@ RSpec.describe Gitlab::UsageData::Topology do
])
end
# Builds a stub for the :query message whose first argument matches
# /node_memory_utilization/ and whose second argument is any Hash.
#
# @param result the value the stub returns; when nil (or false) two default
#   samples are used instead. An empty array is truthy in Ruby, so
#   `result: []` is returned as-is.
# @return the object produced by the receive(...).with(...).and_return(...) chain
def receive_node_memory_utilization_query(result: nil)
  default_samples = [
    { 'metric' => { 'instance' => 'instance1:8080' }, 'value' => [1000, '0.45'] },
    { 'metric' => { 'instance' => 'instance2:8090' }, 'value' => [1000, '0.25'] }
  ]

  stubbed_query = receive(:query).with(/node_memory_utilization/, an_instance_of(Hash))
  stubbed_query.and_return(result || default_samples)
end
def receive_node_cpu_count_query(result: nil)
receive(:query)
.with(/node_cpus/, an_instance_of(Hash))
......@@ -462,6 +509,21 @@ RSpec.describe Gitlab::UsageData::Topology do
])
end
# Builds a stub for the :query message whose first argument matches
# /node_cpu_utilization/ and whose second argument is any Hash.
#
# @param result the value the stub returns; when nil (or false) two default
#   samples are used instead (instance2 listed before instance1). An empty
#   array is truthy in Ruby, so `result: []` is returned as-is.
# @return the object produced by the receive(...).with(...).and_return(...) chain
def receive_node_cpu_utilization_query(result: nil)
  default_samples = [
    { 'metric' => { 'instance' => 'instance2:8090' }, 'value' => [1000, '0.2'] },
    { 'metric' => { 'instance' => 'instance1:8080' }, 'value' => [1000, '0.1'] }
  ]

  stubbed_query = receive(:query).with(/node_cpu_utilization/, an_instance_of(Hash))
  stubbed_query.and_return(result || default_samples)
end
def receive_node_uname_info_query(result: nil)
receive(:query)
.with('node_uname_info')
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment