Commit 7ead5a08 authored by Mikolaj Wawrzyniak's avatar Mikolaj Wawrzyniak

Apply Postgres HLL counter to security jobs

Use new Distributed Postgres HLL counter to calculate challenging
metrics for Usage Ping
parent eb94430f
---
name: postgres_hll_batch_counting
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/48233
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/285485
milestone: '13.7'
type: development
group: group::product analytics
default_enabled: false
......@@ -396,9 +396,36 @@ module EE
def count_secure_pipelines(time_period)
return {} if time_period.blank?
pipelines_with_secure_jobs = {}
# HLL batch counting always iterates over the primary key of the
# given relation, while ordinary batch counting iterates over the
# counted attribute. One-to-many joins can break the batch size
# limitation and lead to batch queries timing out. To avoid that,
# a different join strategy is used for the HLL counter.
if ::Feature.enabled?(:postgres_hll_batch_counting)
relation = ::Security::Scan.where(time_period).group(:created_at)
start = relation.select('MIN(id) as min_id').order('min_id ASC').first&.min_id
finish = relation.select('MAX(id) as max_id').order('max_id DESC').first&.max_id
::Security::Scan.scan_types.each do |name, scan_type|
relation = ::Security::Scan.joins(:build)
.where(ci_builds: { status: 'success', retried: [nil, false] })
.where('security_scans.scan_type = ?', scan_type)
.where(security_scans: time_period)
pipelines_with_secure_jobs["#{name}_pipeline".to_sym] =
if start && finish
estimate_batch_distinct_count(relation, :commit_id, batch_size: 1000, start: start, finish: finish)
else
0
end
end
else
start = ::Ci::Pipeline.minimum(:id)
finish = ::Ci::Pipeline.maximum(:id)
pipelines_with_secure_jobs = {}
::Security::Scan.scan_types.each do |name, scan_type|
relation = ::Ci::Build.joins(:security_scans)
......@@ -407,6 +434,7 @@ module EE
.where(time_period)
pipelines_with_secure_jobs["#{name}_pipeline".to_sym] = distinct_count(relation, :commit_id, start: start, finish: finish, batch: false)
end
end
pipelines_with_secure_jobs
end
......
This diff is collapsed.
......@@ -28,13 +28,14 @@ module Gitlab
# for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%
# In most cases this value is lower. However, if the exact value is necessary, other tools have to be used.
class PostgresHllBatchDistinctCounter
ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 1_250
MAX_ALLOWED_LOOPS = 10_000
MIN_REQUIRED_BATCH_SIZE = 750
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
MAX_DATA_VOLUME = 4_000_000_000
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_BATCH_SIZE = 100_000
DEFAULT_BATCH_SIZE = 10_000
BIT_31_MASK = "B'0#{'1' * 31}'"
BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
......@@ -49,7 +50,7 @@ module Gitlab
SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
(31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
FROM hashed_attributes
GROUP BY 1 ORDER BY 1
GROUP BY 1
SQL
TOTAL_BUCKETS_NUMBER = 512
......@@ -61,7 +62,7 @@ module Gitlab
# Tells whether the requested batching parameters should be rejected.
#
# The configuration is unwanted when any of the following holds (checked
# in this order):
# * batch_size is at or below MIN_REQUIRED_BATCH_SIZE
# * the number of batches needed to cover start..finish reaches MAX_ALLOWED_LOOPS
# * the width of start..finish reaches MAX_DATA_VOLUME
# * start lies past finish
#
# Returns true when the configuration is unwanted, false otherwise.
def unwanted_configuration?(finish, batch_size, start)
  return true if batch_size <= MIN_REQUIRED_BATCH_SIZE

  range_width = finish - start
  return true if range_width / batch_size >= MAX_ALLOWED_LOOPS
  return true if range_width >= MAX_DATA_VOLUME

  start > finish
end
......@@ -101,7 +102,7 @@ module Gitlab
num_uniques = (
((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
(num_zero_buckets + hll_blob.values.sum { |bucket_hash, _| 2**(-1 * bucket_hash)} )
(num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
......
......@@ -25,6 +25,13 @@ module Gitlab
relation.select(relation.all.table[column].sum).to_sql
end
# Counterpart of the estimated distinct count for the raw SQL output.
#
# The exact distinct query is produced here instead of the HLL buckets
# query: the buckets query can't be used to obtain estimations without the
# supplementary ruby code present in
# Gitlab::Database::PostgresHllBatchDistinctCounter.
#
# Any arguments after `column` are accepted and ignored.
def estimate_batch_distinct_count(relation, column = nil, *rest)
  distinct_mode = :distinct

  raw_sql(relation, column, distinct_mode)
end
private
def raw_sql(relation, column, distinct = nil)
......
......@@ -38,6 +38,7 @@ module Gitlab
extend self
FALLBACK = -1
DISTRIBUTED_HLL_FALLBACK = -2
def count(relation, column = nil, batch: true, batch_size: nil, start: nil, finish: nil)
if batch
......@@ -59,6 +60,17 @@ module Gitlab
FALLBACK
end
# Delegates the estimated distinct count to
# Gitlab::Database::PostgresHllBatchDistinctCounter#estimate_distinct_count,
# forwarding batch_size, start and finish unchanged.
#
# Returns the counter's result, FALLBACK when the database raises
# ActiveRecord::StatementInvalid, or DISTRIBUTED_HLL_FALLBACK when any other
# StandardError is raised and the error tracker does not re-raise it.
def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
  hll_counter = Gitlab::Database::PostgresHllBatchDistinctCounter.new(relation, column)

  hll_counter.estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
rescue ActiveRecord::StatementInvalid
  FALLBACK
rescue StandardError => exception
  # This catch-all rescue should be removed as a part of the feature flag
  # rollout issue: https://gitlab.com/gitlab-org/gitlab/-/issues/285485
  Gitlab::ErrorTracking.track_and_raise_for_dev_exception(exception)
  DISTRIBUTED_HLL_FALLBACK
end
def sum(relation, column, batch_size: nil, start: nil, finish: nil)
Gitlab::Database::BatchCount.batch_sum(relation, column, batch_size: batch_size, start: start, finish: finish)
rescue ActiveRecord::StatementInvalid
......
......@@ -3,9 +3,9 @@
require 'spec_helper'
RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
let_it_be(:error_rate) { 4.9 } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
let_it_be(:error_rate) { described_class::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
let_it_be(:fallback) { ::Gitlab::Database::BatchCounter::FALLBACK }
let_it_be(:small_batch_size) { calculate_batch_size(::Gitlab::Database::BatchCounter::MIN_REQUIRED_BATCH_SIZE) }
let_it_be(:small_batch_size) { calculate_batch_size(described_class::MIN_REQUIRED_BATCH_SIZE) }
let(:model) { Issue }
let(:column) { :author_id }
......@@ -118,8 +118,8 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback)
end
it 'returns fallback if loops more than allowed' do
large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_ALLOWED_LOOPS * default_batch_size + 1
it 'returns fallback if data volume exceeds upper limit' do
large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_DATA_VOLUME + 1
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback)
end
......
......@@ -37,6 +37,36 @@ RSpec.describe Gitlab::Utils::UsageData do
end
end
# Covers the delegation to the HLL counter and both fallback paths of
# .estimate_batch_distinct_count.
describe '#estimate_batch_distinct_count' do
  let(:relation) { double(:relation) }

  it 'delegates counting to counter class instance' do
    expect_next_instance_of(Gitlab::Database::PostgresHllBatchDistinctCounter, relation, 'column') do |counter|
      expect(counter).to receive(:estimate_distinct_count)
        .with(batch_size: nil, start: nil, finish: nil)
        .and_return(5)
    end

    estimate = described_class.estimate_batch_distinct_count(relation, 'column')

    expect(estimate).to eq(5)
  end

  it 'returns default fallback value when counting fails due to database error' do
    # Stub the constant so the assertion proves the value comes from FALLBACK.
    stub_const("Gitlab::Utils::UsageData::FALLBACK", 15)
    database_failure = ActiveRecord::StatementInvalid.new('')
    allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(database_failure)

    estimate = described_class.estimate_batch_distinct_count(relation)

    expect(estimate).to eq(15)
  end

  it 'logs error and returns DISTRIBUTED_HLL_FALLBACK value when counting raises any error', :aggregate_failures do
    unexpected_failure = StandardError.new('')
    stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 15)
    allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(unexpected_failure)

    # The tracker expectation has to be registered before the call under test.
    expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception).with(unexpected_failure)

    estimate = described_class.estimate_batch_distinct_count(relation)

    expect(estimate).to eq(15)
  end
end
describe '#sum' do
let(:relation) { double(:relation) }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment