Commit 7ead5a08 authored by Mikolaj Wawrzyniak's avatar Mikolaj Wawrzyniak

Apply Postgres HLL counter to security jobs

Use new Distributed Postgres HLL counter to calculate challanging
metrics for Usage Ping
parent eb94430f
---
name: postgres_hll_batch_counting
introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/48233
rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/285485
milestone: '13.7'
type: development
group: group::product analytics
default_enabled: false
...@@ -396,16 +396,44 @@ module EE ...@@ -396,16 +396,44 @@ module EE
def count_secure_pipelines(time_period) def count_secure_pipelines(time_period)
return {} if time_period.blank? return {} if time_period.blank?
start = ::Ci::Pipeline.minimum(:id)
finish = ::Ci::Pipeline.maximum(:id)
pipelines_with_secure_jobs = {} pipelines_with_secure_jobs = {}
::Security::Scan.scan_types.each do |name, scan_type| # HLL batch counting always iterate over pkey of
relation = ::Ci::Build.joins(:security_scans) # given relation, while ordinary batch count
.where(status: 'success', retried: [nil, false]) # iterated over counted attribute, one-to-many joins
.where('security_scans.scan_type = ?', scan_type) # can break batch size limitation, and lead to
.where(time_period) # time outing batch queries, to avoid that
pipelines_with_secure_jobs["#{name}_pipeline".to_sym] = distinct_count(relation, :commit_id, start: start, finish: finish, batch: false) # different join strategy is used for HLL counter
if ::Feature.enabled?(:postgres_hll_batch_counting)
relation = ::Security::Scan.where(time_period).group(:created_at)
start = relation.select('MIN(id) as min_id').order('min_id ASC').first&.min_id
finish = relation.select('MAX(id) as max_id').order('max_id DESC').first&.max_id
::Security::Scan.scan_types.each do |name, scan_type|
relation = ::Security::Scan.joins(:build)
.where(ci_builds: { status: 'success', retried: [nil, false] })
.where('security_scans.scan_type = ?', scan_type)
.where(security_scans: time_period)
pipelines_with_secure_jobs["#{name}_pipeline".to_sym] =
if start && finish
estimate_batch_distinct_count(relation, :commit_id, batch_size: 1000, start: start, finish: finish)
else
0
end
end
else
start = ::Ci::Pipeline.minimum(:id)
finish = ::Ci::Pipeline.maximum(:id)
::Security::Scan.scan_types.each do |name, scan_type|
relation = ::Ci::Build.joins(:security_scans)
.where(status: 'success', retried: [nil, false])
.where('security_scans.scan_type = ?', scan_type)
.where(time_period)
pipelines_with_secure_jobs["#{name}_pipeline".to_sym] = distinct_count(relation, :commit_id, start: start, finish: finish, batch: false)
end
end end
pipelines_with_secure_jobs pipelines_with_secure_jobs
......
This diff is collapsed.
...@@ -28,13 +28,14 @@ module Gitlab ...@@ -28,13 +28,14 @@ module Gitlab
# for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3% # for given implementation no higher value was reported (https://gitlab.com/gitlab-org/gitlab/-/merge_requests/45673#accuracy-estimation) than 5.3%
# for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used. # for the most of a cases this value is lower. However, if the exact value is necessary other tools has to be used.
class PostgresHllBatchDistinctCounter class PostgresHllBatchDistinctCounter
ERROR_RATE = 4.9 # max encountered empirical error rate, used in tests
FALLBACK = -1 FALLBACK = -1
MIN_REQUIRED_BATCH_SIZE = 1_250 MIN_REQUIRED_BATCH_SIZE = 750
MAX_ALLOWED_LOOPS = 10_000
SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep
MAX_DATA_VOLUME = 4_000_000_000
# Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705 # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
DEFAULT_BATCH_SIZE = 100_000 DEFAULT_BATCH_SIZE = 10_000
BIT_31_MASK = "B'0#{'1' * 31}'" BIT_31_MASK = "B'0#{'1' * 31}'"
BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'" BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"
...@@ -49,7 +50,7 @@ module Gitlab ...@@ -49,7 +50,7 @@ module Gitlab
SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num, SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
(31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
FROM hashed_attributes FROM hashed_attributes
GROUP BY 1 ORDER BY 1 GROUP BY 1
SQL SQL
TOTAL_BUCKETS_NUMBER = 512 TOTAL_BUCKETS_NUMBER = 512
...@@ -61,7 +62,7 @@ module Gitlab ...@@ -61,7 +62,7 @@ module Gitlab
def unwanted_configuration?(finish, batch_size, start) def unwanted_configuration?(finish, batch_size, start)
batch_size <= MIN_REQUIRED_BATCH_SIZE || batch_size <= MIN_REQUIRED_BATCH_SIZE ||
(finish - start) / batch_size >= MAX_ALLOWED_LOOPS || (finish - start) >= MAX_DATA_VOLUME ||
start > finish start > finish
end end
...@@ -101,7 +102,7 @@ module Gitlab ...@@ -101,7 +102,7 @@ module Gitlab
num_uniques = ( num_uniques = (
((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) / ((TOTAL_BUCKETS_NUMBER**2) * (0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER))) /
(num_zero_buckets + hll_blob.values.sum { |bucket_hash, _| 2**(-1 * bucket_hash)} ) (num_zero_buckets + hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash)} )
).to_i ).to_i
if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER if num_zero_buckets > 0 && num_uniques < 2.5 * TOTAL_BUCKETS_NUMBER
......
...@@ -25,6 +25,13 @@ module Gitlab ...@@ -25,6 +25,13 @@ module Gitlab
relation.select(relation.all.table[column].sum).to_sql relation.select(relation.all.table[column].sum).to_sql
end end
# For estimated distinct count use exact query instead of hll
# buckets query, because it can't be used to obtain estimations without
# supplementary ruby code present in Gitlab::Database::PostgresHllBatchDistinctCounter
def estimate_batch_distinct_count(relation, column = nil, *rest)
raw_sql(relation, column, :distinct)
end
private private
def raw_sql(relation, column, distinct = nil) def raw_sql(relation, column, distinct = nil)
......
...@@ -38,6 +38,7 @@ module Gitlab ...@@ -38,6 +38,7 @@ module Gitlab
extend self extend self
FALLBACK = -1 FALLBACK = -1
DISTRIBUTED_HLL_FALLBACK = -2
def count(relation, column = nil, batch: true, batch_size: nil, start: nil, finish: nil) def count(relation, column = nil, batch: true, batch_size: nil, start: nil, finish: nil)
if batch if batch
...@@ -59,6 +60,17 @@ module Gitlab ...@@ -59,6 +60,17 @@ module Gitlab
FALLBACK FALLBACK
end end
def estimate_batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
Gitlab::Database::PostgresHllBatchDistinctCounter.new(relation, column).estimate_distinct_count(batch_size: batch_size, start: start, finish: finish)
rescue ActiveRecord::StatementInvalid
FALLBACK
# catch all rescue should be removed as a part of feature flag rollout issue
# https://gitlab.com/gitlab-org/gitlab/-/issues/285485
rescue StandardError => error
Gitlab::ErrorTracking.track_and_raise_for_dev_exception(error)
DISTRIBUTED_HLL_FALLBACK
end
def sum(relation, column, batch_size: nil, start: nil, finish: nil) def sum(relation, column, batch_size: nil, start: nil, finish: nil)
Gitlab::Database::BatchCount.batch_sum(relation, column, batch_size: batch_size, start: start, finish: finish) Gitlab::Database::BatchCount.batch_sum(relation, column, batch_size: batch_size, start: start, finish: finish)
rescue ActiveRecord::StatementInvalid rescue ActiveRecord::StatementInvalid
......
...@@ -3,9 +3,9 @@ ...@@ -3,9 +3,9 @@
require 'spec_helper' require 'spec_helper'
RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
let_it_be(:error_rate) { 4.9 } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin let_it_be(:error_rate) { described_class::ERROR_RATE } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
let_it_be(:fallback) { ::Gitlab::Database::BatchCounter::FALLBACK } let_it_be(:fallback) { ::Gitlab::Database::BatchCounter::FALLBACK }
let_it_be(:small_batch_size) { calculate_batch_size(::Gitlab::Database::BatchCounter::MIN_REQUIRED_BATCH_SIZE) } let_it_be(:small_batch_size) { calculate_batch_size(described_class::MIN_REQUIRED_BATCH_SIZE) }
let(:model) { Issue } let(:model) { Issue }
let(:column) { :author_id } let(:column) { :author_id }
...@@ -118,8 +118,8 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do ...@@ -118,8 +118,8 @@ RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCounter do
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback) expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: 0)).to eq(fallback)
end end
it 'returns fallback if loops more than allowed' do it 'returns fallback if data volume exceeds upper limit' do
large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_ALLOWED_LOOPS * default_batch_size + 1 large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_DATA_VOLUME + 1
expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback) expect(described_class.new(model, column).estimate_distinct_count(start: 1, finish: large_finish)).to eq(fallback)
end end
......
...@@ -37,6 +37,36 @@ RSpec.describe Gitlab::Utils::UsageData do ...@@ -37,6 +37,36 @@ RSpec.describe Gitlab::Utils::UsageData do
end end
end end
describe '#estimate_batch_distinct_count' do
let(:relation) { double(:relation) }
it 'delegates counting to counter class instance' do
expect_next_instance_of(Gitlab::Database::PostgresHllBatchDistinctCounter, relation, 'column') do |instance|
expect(instance).to receive(:estimate_distinct_count)
.with(batch_size: nil, start: nil, finish: nil)
.and_return(5)
end
expect(described_class.estimate_batch_distinct_count(relation, 'column')).to eq(5)
end
it 'returns default fallback value when counting fails due to database error' do
stub_const("Gitlab::Utils::UsageData::FALLBACK", 15)
allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(ActiveRecord::StatementInvalid.new(''))
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
end
it 'logs error and returns DISTRIBUTED_HLL_FALLBACK value when counting raises any error', :aggregate_failures do
error = StandardError.new('')
stub_const("Gitlab::Utils::UsageData::DISTRIBUTED_HLL_FALLBACK", 15)
allow(Gitlab::Database::PostgresHllBatchDistinctCounter).to receive(:new).and_raise(error)
expect(Gitlab::ErrorTracking).to receive(:track_and_raise_for_dev_exception).with(error)
expect(described_class.estimate_batch_distinct_count(relation)).to eq(15)
end
end
describe '#sum' do describe '#sum' do
let(:relation) { double(:relation) } let(:relation) { double(:relation) }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment