Commit 9bd56a22 authored by Michael Kozono's avatar Michael Kozono

Merge branch 'mwaw/persist_postgres_hll_buckets_in_redis' into 'master'

Persist postgres hll buckets in redis

See merge request gitlab-org/gitlab!48778
parents 8b4c3352 12c461aa
...@@ -436,10 +436,20 @@ module EE ...@@ -436,10 +436,20 @@ module EE
.where('security_scans.scan_type = ?', scan_type) .where('security_scans.scan_type = ?', scan_type)
.where(security_scans: time_period) .where(security_scans: time_period)
pipelines_with_secure_jobs["#{name}_pipeline".to_sym] = metric_name = "#{name}_pipeline"
aggregated_metrics_params = {
metric_name: metric_name,
recorded_at_timestamp: recorded_at,
time_period: time_period
}
pipelines_with_secure_jobs[metric_name.to_sym] =
if start_id && finish_id if start_id && finish_id
estimate_batch_distinct_count(relation, :commit_id, batch_size: 1000, start: start_id, finish: finish_id) estimate_batch_distinct_count(relation, :commit_id, batch_size: 1000, start: start_id, finish: finish_id) do |result|
save_aggregated_metrics(aggregated_metrics_params.merge(data: result))
end
else else
save_aggregated_metrics(aggregated_metrics_params.merge(data: ::Gitlab::Database::PostgresHll::Buckets.new))
0 0
end end
end end
...@@ -500,29 +510,29 @@ module EE ...@@ -500,29 +510,29 @@ module EE
def merge_requests_with_overridden_project_rules(time_period = nil) def merge_requests_with_overridden_project_rules(time_period = nil)
sql = sql =
<<~SQL <<~SQL
(EXISTS ( (EXISTS (
SELECT
1
FROM
approval_merge_request_rule_sources
WHERE
approval_merge_request_rule_sources.approval_merge_request_rule_id = approval_merge_request_rules.id
AND NOT EXISTS (
SELECT SELECT
1 1
FROM FROM
approval_project_rules approval_merge_request_rule_sources
WHERE WHERE
approval_project_rules.id = approval_merge_request_rule_sources.approval_project_rule_id approval_merge_request_rule_sources.approval_merge_request_rule_id = approval_merge_request_rules.id
AND EXISTS ( AND NOT EXISTS (
SELECT SELECT
1 1
FROM FROM
projects approval_project_rules
WHERE WHERE
projects.id = approval_project_rules.project_id approval_project_rules.id = approval_merge_request_rule_sources.approval_project_rule_id
AND projects.disable_overriding_approvers_per_merge_request = FALSE)))) AND EXISTS (
OR("approval_merge_request_rules"."modified_from_project_rule" = TRUE) SELECT
1
FROM
projects
WHERE
projects.id = approval_project_rules.project_id
AND projects.disable_overriding_approvers_per_merge_request = FALSE))))
OR("approval_merge_request_rules"."modified_from_project_rule" = TRUE)
SQL SQL
distinct_count( distinct_count(
......
...@@ -23,6 +23,7 @@ module Gitlab ...@@ -23,6 +23,7 @@ module Gitlab
deployment_minimum_id deployment_minimum_id
deployment_maximum_id deployment_maximum_id
auth_providers auth_providers
recorded_at
).freeze ).freeze
class << self class << self
...@@ -75,7 +76,7 @@ module Gitlab ...@@ -75,7 +76,7 @@ module Gitlab
end end
def recorded_at def recorded_at
Time.current @recorded_at ||= Time.current
end end
# rubocop: disable Metrics/AbcSize # rubocop: disable Metrics/AbcSize
......
...@@ -39,6 +39,9 @@ module Gitlab ...@@ -39,6 +39,9 @@ module Gitlab
FALLBACK = -1 FALLBACK = -1
DISTRIBUTED_HLL_FALLBACK = -2 DISTRIBUTED_HLL_FALLBACK = -2
ALL_TIME_PERIOD_HUMAN_NAME = "all_time"
WEEKLY_PERIOD_HUMAN_NAME = "weekly"
MONTHLY_PERIOD_HUMAN_NAME = "monthly"
def count(relation, column = nil, batch: true, batch_size: nil, start: nil, finish: nil) def count(relation, column = nil, batch: true, batch_size: nil, start: nil, finish: nil)
if batch if batch
...@@ -61,10 +64,13 @@ module Gitlab ...@@ -61,10 +64,13 @@ module Gitlab
end end
def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil) def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
Gitlab::Database::PostgresHll::BatchDistinctCounter buckets = Gitlab::Database::PostgresHll::BatchDistinctCounter
.new(relation, column) .new(relation, column)
.execute(batch_size: batch_size, start: start, finish: finish) .execute(batch_size: batch_size, start: start, finish: finish)
.estimated_distinct_count
yield buckets if block_given?
buckets.estimated_distinct_count
rescue ActiveRecord::StatementInvalid rescue ActiveRecord::StatementInvalid
FALLBACK FALLBACK
# catch all rescue should be removed as a part of feature flag rollout issue # catch all rescue should be removed as a part of feature flag rollout issue
...@@ -74,6 +80,27 @@ module Gitlab ...@@ -74,6 +80,27 @@ module Gitlab
DISTRIBUTED_HLL_FALLBACK DISTRIBUTED_HLL_FALLBACK
end end
def save_aggregated_metrics(metric_name:, time_period:, recorded_at_timestamp:, data:)
unless data.is_a? ::Gitlab::Database::PostgresHll::Buckets
Gitlab::ErrorTracking.track_and_raise_for_dev_exception(StandardError.new("Unsupported data type: #{data.class}"))
return
end
# the longest recorded usage ping generation time for gitlab.com
# was below 40 hours, there is added error margin of 20 h
usage_ping_generation_period = 80.hours
# add timestamp at the end of the key to avoid stale keys if
# usage ping job is retried
redis_key = "#{metric_name}_#{time_period_to_human_name(time_period)}-#{recorded_at_timestamp}"
Gitlab::Redis::SharedState.with do |redis|
redis.set(redis_key, data.to_json, ex: usage_ping_generation_period)
end
rescue ::Redis::CommandError => e
Gitlab::ErrorTracking.track_and_raise_for_dev_exception(e)
end
def sum(relation, column, batch_size: nil, start: nil, finish: nil) def sum(relation, column, batch_size: nil, start: nil, finish: nil)
Gitlab::Database::BatchCount.batch_sum(relation, column, batch_size: batch_size, start: start, finish: finish) Gitlab::Database::BatchCount.batch_sum(relation, column, batch_size: batch_size, start: start, finish: finish)
rescue ActiveRecord::StatementInvalid rescue ActiveRecord::StatementInvalid
...@@ -125,6 +152,20 @@ module Gitlab ...@@ -125,6 +152,20 @@ module Gitlab
Gitlab::UsageDataCounters::HLLRedisCounter.track_event(event_name.to_s, values: values) Gitlab::UsageDataCounters::HLLRedisCounter.track_event(event_name.to_s, values: values)
end end
def time_period_to_human_name(time_period)
return ALL_TIME_PERIOD_HUMAN_NAME if time_period.blank?
date_range = time_period.values[0]
start_date = date_range.first.to_date
end_date = date_range.last.to_date
if (end_date - start_date).to_i > 7
MONTHLY_PERIOD_HUMAN_NAME
else
WEEKLY_PERIOD_HUMAN_NAME
end
end
private private
def prometheus_client(verify:) def prometheus_client(verify:)
......
...@@ -58,6 +58,16 @@ RSpec.describe Gitlab::Utils::UsageData do ...@@ -58,6 +58,16 @@ RSpec.describe Gitlab::Utils::UsageData do
expect(described_class.estimate_batch_distinct_count(relation, 'column')).to eq(5) expect(described_class.estimate_batch_distinct_count(relation, 'column')).to eq(5)
end end
it 'yield provided block with PostgresHll::Buckets' do
buckets = Gitlab::Database::PostgresHll::Buckets.new
allow_next_instance_of(Gitlab::Database::PostgresHll::BatchDistinctCounter) do |instance|
allow(instance).to receive(:execute).and_return(buckets)
end
expect { |block| described_class.estimate_batch_distinct_count(relation, 'column', &block) }.to yield_with_args(buckets)
end
context 'quasi integration test for different counting parameters' do context 'quasi integration test for different counting parameters' do
# HyperLogLog http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf algorithm # HyperLogLog http://algo.inria.fr/flajolet/Publications/FlFuGaMe07.pdf algorithm
# used in estimate_batch_distinct_count produce probabilistic # used in estimate_batch_distinct_count produce probabilistic
...@@ -362,4 +372,97 @@ RSpec.describe Gitlab::Utils::UsageData do ...@@ -362,4 +372,97 @@ RSpec.describe Gitlab::Utils::UsageData do
end end
end end
end end
describe '#save_aggregated_metrics', :clean_gitlab_redis_shared_state do
let(:timestamp) { Time.current.to_i }
let(:time_period) { { created_at: 7.days.ago..Date.current } }
let(:metric_name) { 'test_metric' }
let(:method_params) do
{
metric_name: metric_name,
time_period: time_period,
recorded_at_timestamp: timestamp,
data: data
}
end
context 'with compatible data argument' do
let(:data) { ::Gitlab::Database::PostgresHll::Buckets.new(141 => 1, 56 => 1) }
it 'persists serialized data in Redis' do
time_period_name = 'weekly'
expect(described_class).to receive(:time_period_to_human_name).with(time_period).and_return(time_period_name)
Gitlab::Redis::SharedState.with do |redis|
expect(redis).to receive(:set).with("#{metric_name}_#{time_period_name}-#{timestamp}", '{"141":1,"56":1}', ex: 80.hours)
end
described_class.save_aggregated_metrics(method_params)
end
context 'error handling' do
before do
allow(Gitlab::Redis::SharedState).to receive(:with).and_raise(::Redis::CommandError)
end
it 'rescues and reraise ::Redis::CommandError for development and test environments' do
expect { described_class.save_aggregated_metrics(method_params) }.to raise_error ::Redis::CommandError
end
context 'for environment different than development' do
before do
stub_rails_env('production')
end
it 'rescues ::Redis::CommandError' do
expect { described_class.save_aggregated_metrics(method_params) }.not_to raise_error
end
end
end
end
context 'with incompatible data argument' do
let(:data) { 1 }
context 'for environment different than development' do
before do
stub_rails_env('production')
end
it 'does not persist data in Redis' do
Gitlab::Redis::SharedState.with do |redis|
expect(redis).not_to receive(:set)
end
described_class.save_aggregated_metrics(method_params)
end
end
it 'raises error for development environment' do
expect { described_class.save_aggregated_metrics(method_params) }.to raise_error /Unsupported data type/
end
end
end
describe '#time_period_to_human_name' do
it 'translates empty time period as all_time' do
expect(described_class.time_period_to_human_name({})).to eql 'all_time'
end
it 'translates time period not longer than 7 days as weekly', :aggregate_failures do
days_6_time_period = 6.days.ago..Date.current
days_7_time_period = 7.days.ago..Date.current
expect(described_class.time_period_to_human_name(column_name: days_6_time_period)).to eql 'weekly'
expect(described_class.time_period_to_human_name(column_name: days_7_time_period)).to eql 'weekly'
end
it 'translates time period longer than 7 days as monthly', :aggregate_failures do
days_8_time_period = 8.days.ago..Date.current
days_31_time_period = 31.days.ago..Date.current
expect(described_class.time_period_to_human_name(column_name: days_8_time_period)).to eql 'monthly'
expect(described_class.time_period_to_human_name(column_name: days_31_time_period)).to eql 'monthly'
end
end
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment