Commit 0b5654ea authored by James Lopez's avatar James Lopez

Merge branch 'mwaw/caluculate_intersetcion_of_redis_hll_metrics' into 'master'

Calculate intersection of redis hll metrics

See merge request gitlab-org/gitlab!46146
parents 64cdc28c 1a40ddfe
#- name: unique name of aggregated metric
# operator: aggregation operator. Valid values are:
# - "ANY": counts unique elements that were observed triggering any of following events
# - "ALL": counts unique elements that were observed triggering all of following events
# events: list of event names to aggregate into the metric. All events in this list must have the same 'redis_slot' and 'aggregation' attributes.
# See lib/gitlab/usage_data_counters/known_events.yml for the list of valid events.
---
- name: product_analytics_test_aggregated_metrics
  operator: ANY
  events: ['i_search_total', 'i_search_advanced', 'i_search_paid']
- name: product_analytics_test_combined_events
  operator: ALL
  events: ['i_search_total', 'i_search_advanced', 'i_search_paid']
......@@ -17,8 +17,10 @@ module Gitlab
# Glob for the YAML files under known_events/, resolved relative to this file's directory.
KNOWN_EVENTS_PATH = File.expand_path('known_events/*.yml', __dir__)
# Permitted aggregation values.
ALLOWED_AGGREGATIONS = %i(daily weekly).freeze
# Operators accepted for aggregated metrics: 'ANY' selects the union of the
# listed events, 'ALL' selects their intersection.
UNION_OF_AGGREGATED_METRICS = 'ANY'
INTERSECTION_OF_AGGREGATED_METRICS = 'ALL'
# Defined exactly once: a second assignment restricted to %w[ANY] would drop
# the 'ALL' operator and emit an "already initialized constant" warning.
ALLOWED_METRICS_AGGREGATIONS = [UNION_OF_AGGREGATED_METRICS, INTERSECTION_OF_AGGREGATED_METRICS].freeze
# Glob for the YAML files under aggregated_metrics/, resolved relative to this file's directory.
AGGREGATED_METRICS_PATH = File.expand_path('aggregated_metrics/*.yml', __dir__)
# Track event on entity_id
# Increment a Redis HLL counter for unique event_name and entity_id
......@@ -113,9 +115,79 @@ module Gitlab
private
# Computes the count for one aggregated metric definition.
#
# aggregation - Hash-like object read via :operator and :events.
# start_date, end_date - forwarded unchanged to the union/intersection calculation.
#
# Dispatches on the operator: UNION_OF_AGGREGATED_METRICS goes to
# calculate_events_union, INTERSECTION_OF_AGGREGATED_METRICS goes to
# calculate_events_intersections.
#
# Raises UnknownAggregationOperator for any other operator. The `else` branch
# is the single validation point; a separate pre-validation call would raise
# the same error with the same message and make this branch unreachable.
def calculate_count_for_aggregation(aggregation, start_date:, end_date:)
  case aggregation[:operator]
  when UNION_OF_AGGREGATED_METRICS
    calculate_events_union(event_names: aggregation[:events], start_date: start_date, end_date: end_date)
  when INTERSECTION_OF_AGGREGATED_METRICS
    calculate_events_intersections(event_names: aggregation[:events], start_date: start_date, end_date: end_date)
  else
    raise UnknownAggregationOperator, "Events should be aggregated with one of operators #{ALLOWED_METRICS_AGGREGATIONS}"
  end
end
# Calculates the size of the intersection of 'n' sets (events) using the
# inclusion-exclusion principle https://en.wikipedia.org/wiki/Inclusion%E2%80%93exclusion_principle
# This method will be extracted to a dedicated module with https://gitlab.com/gitlab-org/gitlab/-/issues/273391
#
# Rearranging the inclusion-exclusion equation for the full intersection gives, e.g.:
#   |A & B & C|     = - (|A| + |B| + |C|) + (|A & B| + |A & C| + |B & C|) + |A + B + C|
#   |A & B & C & D| =   (|A| + |B| + |C| + |D|) - (|A & B| + .. + |C & D|) + (|A & B & C| + .. + |B & C & D|) - |A + B + C + D|
# where '+' between sets denotes union and '&' denotes intersection.
#
# event_names - Array of event names whose intersection is counted.
# start_date, end_date - forwarded unchanged to the underlying count helpers.
# subset_powers_cache - two-level cache (subset size => key => count) shared across
#   the recursive calls so every sub-intersection and union is computed once.
#   The default uses a block so each subset size gets its own inner Hash;
#   `Hash.new({})` would hand every size the same shared default object.
#
# Returns the computed size of the intersection.
def calculate_events_intersections(event_names:, start_date:, end_date:, subset_powers_cache: Hash.new { |cache, size| cache[size] = {} })
  # All components of the equation except the last one (the union of all events):
  # one summed entry per subset size 1..n-1.
  subset_powers_data = subsets_intersection_powers(event_names, start_date, end_date, subset_powers_cache)

  # Last component of the equation: |A + B + C + ...|
  power_of_union_of_all_events =
    subset_powers_cache[event_names.size][event_names.join('_+_')] ||=
      calculate_events_union(event_names: event_names, start_date: start_date, end_date: end_date)

  # The sign of every component depends on whether the number of subset sizes
  # (n - 1) is even or odd; compare the 3-set and 4-set equations above.
  subset_powers_size_even = subset_powers_data.size.even?

  sum_of_all_subset_powers = sum_subset_powers(subset_powers_data, subset_powers_size_even)

  # Add the union term with the sign dictated by the parity.
  sum_of_all_subset_powers + (subset_powers_size_even ? power_of_union_of_all_events : -power_of_union_of_all_events)
end
# Combines the per-subset-size sums into the signed partial sum of the
# inclusion-exclusion equation (everything except the final union term).
#
# subset_powers_data - Array of numbers; element i holds the summed
#   intersection sizes of all subsets of size i + 1.
# subset_powers_size_even - true when subset_powers_data has an even number of
#   elements; in that case the whole alternating sum is negated.
#
# Returns the signed sum.
def sum_subset_powers(subset_powers_data, subset_powers_size_even)
  # Alternate signs starting with '+' for subsets of size 1.
  sum_without_sign = subset_powers_data.each_with_index.sum do |value, index|
    index.even? ? value : -value
  end

  (subset_powers_size_even ? -1 : 1) * sum_without_sign
end
# Builds the components of the inclusion-exclusion equation for every subset
# size from 1 up to event_names.size - 1.
#
# Returns an Array whose element at index i is the sum of the intersection
# sizes over all subsets of size i + 1. Every individual value is read from /
# memoized in subset_powers_cache[subset_size].
def subsets_intersection_powers(event_names, start_date, end_date, subset_powers_cache)
  1.upto(event_names.size - 1).map do |size|
    if size == 1
      # |A| + |B| + |C| + ... : each event on its own
      event_names.map do |name|
        subset_powers_cache[size][name] ||=
          unique_events(event_names: name, start_date: start_date, end_date: end_date)
      end.sum
    else
      # |A & B| + |A & C| + ... : every subset of the given size, computed recursively
      event_names.combination(size).map do |subset|
        subset_powers_cache[size][subset.join('_&_')] ||=
          calculate_events_intersections(event_names: subset, start_date: start_date, end_date: end_date, subset_powers_cache: subset_powers_cache)
      end.sum
    end
  end
end
# Delegates to count_unique_events for the given events and date range,
# passing a block that rejects incompatible event sets.
# NOTE(review): the closing `end` of this method lies outside the visible excerpt.
def calculate_events_union(event_names:, start_date:, end_date:)
count_unique_events(event_names: event_names, start_date: start_date, end_date: end_date) do |events|
# The block raises unless all events share a slot and an aggregation.
raise SlotMismatch, events unless events_in_same_slot?(events)
raise AggregationMismatch, events unless events_same_aggregation?(events)
end
......@@ -226,12 +298,6 @@ module Gitlab
end.flatten
end
# Checks that the operator is one of ALLOWED_METRICS_AGGREGATIONS.
#
# Returns true for a permitted operator.
# Raises UnknownAggregationOperator otherwise.
def validate_aggregation_operator!(operator)
  unless ALLOWED_METRICS_AGGREGATIONS.include?(operator)
    raise UnknownAggregationOperator, "Events should be aggregated with one of operators #{ALLOWED_METRICS_AGGREGATIONS}"
  end

  true
end
def weekly_redis_keys(events:, start_date:, end_date:)
weeks = end_date.to_date.cweek - start_date.to_date.cweek
weeks = 1 if weeks == 0
......
......@@ -277,29 +277,23 @@ RSpec.describe Gitlab::UsageDataCounters::HLLRedisCounter, :clean_gitlab_redis_s
end
end
context 'aggregated metrics' do
context 'aggregated_metrics_data' do
let(:known_events) do
[
{ name: 'event1_slot', redis_slot: "slot", category: 'category1', aggregation: "weekly" },
{ name: 'event2_slot', redis_slot: "slot", category: 'category2', aggregation: "weekly" },
{ name: 'event3', category: 'category2', aggregation: "weekly" }
].map(&:with_indifferent_access)
end
let(:aggregated_metrics) do
[
{ name: 'gmau_1', events: %w[event1_slot event2_slot], operator: "ANY" },
{ name: 'gmau_2', events: %w[event3], operator: "ANY" }
{ name: 'event3_slot', redis_slot: "slot", category: 'category3', aggregation: "weekly" },
{ name: 'event5_slot', redis_slot: "slot", category: 'category4', aggregation: "weekly" },
{ name: 'event4', category: 'category2', aggregation: "weekly" }
].map(&:with_indifferent_access)
end
before do
allow(described_class).to receive(:known_events).and_return(known_events)
allow(described_class).to receive(:aggregated_metrics).and_return(aggregated_metrics)
end
shared_examples 'aggregated_metrics_data' do
context 'no combination is tracked' do
context 'no aggregated metrics is defined' do
it 'returns empty hash' do
allow(described_class).to receive(:aggregated_metrics).and_return([])
......@@ -307,14 +301,51 @@ RSpec.describe Gitlab::UsageDataCounters::HLLRedisCounter, :clean_gitlab_redis_s
end
end
context 'there are some combinations defined' do
it 'returns the number of unique events for all known events' do
results = {
'gmau_1' => 2,
'gmau_2' => 3
}
context 'there are aggregated metrics defined' do
before do
allow(described_class).to receive(:aggregated_metrics).and_return(aggregated_metrics)
end
context 'with ALL operator' do
let(:aggregated_metrics) do
[
{ name: 'gmau_1', events: %w[event1_slot event2_slot], operator: "ALL" },
{ name: 'gmau_2', events: %w[event1_slot event2_slot event3_slot], operator: "ALL" },
{ name: 'gmau_3', events: %w[event1_slot event2_slot event3_slot event5_slot], operator: "ALL" },
{ name: 'gmau_4', events: %w[event4], operator: "ALL" }
].map(&:with_indifferent_access)
end
expect(aggregated_metrics_data).to eq(results)
it 'returns the number of unique events for all known events' do
results = {
'gmau_1' => 3,
'gmau_2' => 2,
'gmau_3' => 1,
'gmau_4' => 3
}
expect(aggregated_metrics_data).to eq(results)
end
end
# Union ("ANY") scenarios: two same-slot events, four same-slot events, and a
# single event.
# NOTE(review): the expected counts rely on the events tracked by the example
# group that includes these shared examples — confirm against its `before` block.
context 'with ANY operator' do
let(:aggregated_metrics) do
[
{ name: 'gmau_1', events: %w[event3_slot event5_slot], operator: "ANY" },
{ name: 'gmau_2', events: %w[event1_slot event2_slot event3_slot event5_slot], operator: "ANY" },
{ name: 'gmau_3', events: %w[event4], operator: "ANY" }
].map(&:with_indifferent_access)
end
it 'returns the number of unique events for all known events' do
# Expected result is keyed by aggregated metric name.
results = {
'gmau_1' => 2,
'gmau_2' => 3,
'gmau_3' => 3
}
expect(aggregated_metrics_data).to eq(results)
end
end
end
end
......@@ -324,16 +355,22 @@ RSpec.describe Gitlab::UsageDataCounters::HLLRedisCounter, :clean_gitlab_redis_s
before do
described_class.track_event(entity1, 'event1_slot', 2.days.ago)
described_class.track_event(entity2, 'event1_slot', 2.days.ago)
described_class.track_event(entity3, 'event1_slot', 2.days.ago)
described_class.track_event(entity1, 'event2_slot', 2.days.ago)
described_class.track_event(entity2, 'event2_slot', 3.days.ago)
described_class.track_event(entity3, 'event2_slot', 3.days.ago)
described_class.track_event(entity1, 'event3_slot', 3.days.ago)
described_class.track_event(entity2, 'event3_slot', 3.days.ago)
described_class.track_event(entity2, 'event5_slot', 3.days.ago)
# events out of time scope
described_class.track_event(entity3, 'event2_slot', 8.days.ago)
# events in different slots
described_class.track_event(entity1, 'event3', 2.days.ago)
described_class.track_event(entity2, 'event3', 2.days.ago)
described_class.track_event(entity4, 'event3', 2.days.ago)
described_class.track_event(entity1, 'event4', 2.days.ago)
described_class.track_event(entity2, 'event4', 2.days.ago)
described_class.track_event(entity4, 'event4', 2.days.ago)
end
it_behaves_like 'aggregated_metrics_data'
......@@ -342,21 +379,58 @@ RSpec.describe Gitlab::UsageDataCounters::HLLRedisCounter, :clean_gitlab_redis_s
describe '.aggregated_metrics_monthly_data' do
subject(:aggregated_metrics_data) { described_class.aggregated_metrics_monthly_data }
before do
described_class.track_event(entity1, 'event1_slot', 2.days.ago)
described_class.track_event(entity1, 'event2_slot', 10.days.ago)
described_class.track_event(entity3, 'event2_slot', 4.weeks.ago.advance(days: 1))
it_behaves_like 'aggregated_metrics_data' do
before do
described_class.track_event(entity1, 'event1_slot', 2.days.ago)
described_class.track_event(entity2, 'event1_slot', 2.days.ago)
described_class.track_event(entity3, 'event1_slot', 2.days.ago)
described_class.track_event(entity1, 'event2_slot', 2.days.ago)
described_class.track_event(entity2, 'event2_slot', 3.days.ago)
described_class.track_event(entity3, 'event2_slot', 3.days.ago)
described_class.track_event(entity1, 'event3_slot', 3.days.ago)
described_class.track_event(entity2, 'event3_slot', 10.days.ago)
described_class.track_event(entity2, 'event5_slot', 4.weeks.ago.advance(days: 1))
# events out of time scope
described_class.track_event(entity1, 'event5_slot', 4.weeks.ago.advance(days: -1))
# events in different slots
described_class.track_event(entity1, 'event4', 2.days.ago)
described_class.track_event(entity2, 'event4', 2.days.ago)
described_class.track_event(entity4, 'event4', 2.days.ago)
end
end
# events out of time scope
described_class.track_event(entity3, 'event2_slot', 4.weeks.ago.advance(days: -1))
context 'Redis calls' do
let(:aggregated_metrics) do
[
{ name: 'gmau_3', events: %w[event1_slot event2_slot event3_slot event5_slot], operator: "ALL" }
].map(&:with_indifferent_access)
end
# events in different slots
described_class.track_event(entity1, 'event3', 2.days.ago)
described_class.track_event(entity2, 'event3', 2.days.ago)
described_class.track_event(entity4, 'event3', 2.days.ago)
end
let(:known_events) do
[
{ name: 'event1_slot', redis_slot: "slot", category: 'category1', aggregation: "weekly" },
{ name: 'event2_slot', redis_slot: "slot", category: 'category2', aggregation: "weekly" },
{ name: 'event3_slot', redis_slot: "slot", category: 'category3', aggregation: "weekly" },
{ name: 'event5_slot', redis_slot: "slot", category: 'category4', aggregation: "weekly" }
].map(&:with_indifferent_access)
end
it_behaves_like 'aggregated_metrics_data'
# Verifies that Gitlab::Redis::HLL.count is requested exactly once per
# combination of known events while the subject is evaluated.
it 'caches intermediate operations' do
allow(described_class).to receive(:known_events).and_return(known_events)
allow(described_class).to receive(:aggregated_metrics).and_return(aggregated_metrics)
# Walk every subset size from 4 down to 1 and every combination of that size.
4.downto(1) do |subset_size|
known_events.combination(subset_size).each do |events|
# weekly_redis_keys is invoked through `send`; keys span 4 weeks ago to the current date.
keys = described_class.send(:weekly_redis_keys, events: events, start_date: 4.weeks.ago.to_date, end_date: Date.current)
# `.once` makes the example fail if the same key set is counted again.
expect(Gitlab::Redis::HLL).to receive(:count).with(keys: keys).once.and_return(0)
end
end
# Evaluate the subject so the expectations above are exercised.
subject
end
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment