Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
G
gitlab-ce
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
1
Merge Requests
1
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
nexedi
gitlab-ce
Commits
636fb4c9
Commit
636fb4c9
authored
Oct 20, 2020
by
Mikolaj Wawrzyniak
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Distribute hll batch counter
parent
c0f7c071
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
298 additions
and
0 deletions
+298
-0
lib/gitlab/database/postgres_hll_batch_distinct_count.rb
lib/gitlab/database/postgres_hll_batch_distinct_count.rb
+136
-0
spec/lib/gitlab/database/postgres_hll_batch_distinct_count_spec.rb
...gitlab/database/postgres_hll_batch_distinct_count_spec.rb
+162
-0
No files found.
lib/gitlab/database/postgres_hll_batch_distinct_count.rb
0 → 100644
View file @
636fb4c9
# frozen_string_literal: true
module
Gitlab
module
Database
module PostgresHllBatchDistinctCount
  # Convenience entry point: builds a PostgresHllBatchDistinctCounter for
  # +relation+ / +column+ and returns the result of its #count.
  #
  # @param relation the relation handed to PostgresHllBatchDistinctCounter.new
  # @param column optional column, forwarded as the +column:+ keyword
  # @param batch_size forwarded unchanged to #count
  # @param start forwarded unchanged to #count
  # @param finish forwarded unchanged to #count
  # @return whatever PostgresHllBatchDistinctCounter#count returns
  def batch_distinct_count(relation, column = nil, batch_size: nil, start: nil, finish: nil)
    counter = PostgresHllBatchDistinctCounter.new(relation, column: column)

    counter.count(batch_size: batch_size, start: start, finish: finish)
  end

  # Makes batch_distinct_count callable directly on the module as well as
  # from any object that includes it.
  extend self
end
# Estimates the number of distinct values of a column using a HyperLogLog
# style sketch. The relation is scanned in primary-key ranges ("batches");
# every batch yields a hash of bucket number => bucket value, the hashes are
# merged by keeping the highest value per bucket, and the merged hash is
# turned into a cardinality estimate.
class PostgresHllBatchDistinctCounter
  # Value returned by #count when counting is refused or has to be abandoned.
  FALLBACK = -1
  # Batch sizes less than or equal to this value are rejected.
  MIN_REQUIRED_BATCH_SIZE = 1_250
  # Upper bound for (finish - start) / batch_size.
  MAX_ALLOWED_LOOPS = 10_000
  SLEEP_TIME_IN_SECONDS = 0.01 # 10 msec sleep

  # Each query should take < 500ms https://gitlab.com/gitlab-org/gitlab/-/merge_requests/22705
  DEFAULT_BATCH_SIZE = 100_000

  # Number of buckets in the sketch. BIT_9_MASK selects 9 bits of the hash as
  # the bucket number, so this has to stay equal to 2**9.
  TOTAL_BUCKETS_NUMBER = 512
  # HyperLogLog bias-correction constant for TOTAL_BUCKETS_NUMBER buckets.
  BIAS_CORRECTION = 0.7213 / (1 + 1.079 / TOTAL_BUCKETS_NUMBER)
  # Raw estimates below this limit are replaced by the small-range formula
  # (only when at least one bucket is still empty).
  SMALL_RANGE_LIMIT = 2.5 * TOTAL_BUCKETS_NUMBER

  # PostgreSQL bit-string literal: a leading 0 followed by 31 ones.
  BIT_31_MASK = "B'0#{'1' * 31}'"
  # PostgreSQL bit-string literal: 23 zeros followed by 9 ones.
  BIT_9_MASK = "B'#{'0' * 23}#{'1' * 9}'"

  # source_query:
  #   SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
  #   FROM %{relation}
  #   WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
  #   AND %{column} IS NOT NULL
  #
  # For every bucket (hash & BIT_9_MASK) the query returns
  # 31 - floor(log2(min(hash & BIT_31_MASK))).
  # NOTE(review): log(2, 0) looks like it would fail if a masked hash is
  # exactly 0 - confirm whether that case needs handling.
  BUCKETED_DATA_SQL = <<~SQL
    WITH hashed_attributes AS (%{source_query})
    SELECT (attr_hash_32_bits & #{BIT_9_MASK})::int AS bucket_num,
      (31 - floor(log(2, min((attr_hash_32_bits & #{BIT_31_MASK})::int))))::int as bucket_hash
    FROM hashed_attributes
    GROUP BY 1 ORDER BY 1
  SQL

  # @param relation relation to count over; must respond to primary_key,
  #   connection, where and unscope
  # @param column column whose distinct values are estimated; defaults to the
  #   relation's primary key
  # @param operation_args stored but not read anywhere in this class
  def initialize(relation, column: nil, operation_args: nil)
    @relation = relation
    @column = column || relation.primary_key
    @operation_args = operation_args
  end

  # @return [Boolean] true when the batch size is not above
  #   MIN_REQUIRED_BATCH_SIZE, when the range would need MAX_ALLOWED_LOOPS or
  #   more batches, or when start lies after finish
  def unwanted_configuration?(finish, batch_size, start)
    batch_size <= MIN_REQUIRED_BATCH_SIZE ||
      (finish - start) / batch_size >= MAX_ALLOWED_LOOPS ||
      start > finish
  end

  # Runs the batched estimation.
  #
  # @param batch_size [Integer, nil] primary-key span per query, defaults to
  #   DEFAULT_BATCH_SIZE
  # @param start [Integer, nil] first primary key, defaults to the relation's
  #   minimum primary key (0 when there is none)
  # @param finish [Integer, nil] last primary key, defaults to the relation's
  #   maximum primary key (0 when there is none)
  # @return [Numeric] the estimate (an Integer, or a Float when the
  #   small-range formula is applied), or FALLBACK when the configuration is
  #   unwanted or a canceled query cannot be retried with a smaller batch
  # @raise [RuntimeError] inside an open transaction, or when start or finish
  #   is negative
  def count(batch_size: nil, start: nil, finish: nil)
    raise 'BatchCount can not be run inside a transaction' if ActiveRecord::Base.connection.transaction_open?

    batch_size ||= DEFAULT_BATCH_SIZE

    start = actual_start(start)
    finish = actual_finish(finish)

    raise "Batch counting expects positive values only for #{@column}" if start < 0 || finish < 0
    return FALLBACK if unwanted_configuration?(finish, batch_size, start)

    batch_start = start
    hll_blob = {}

    while batch_start <= finish
      begin
        # Keep the highest value seen so far for every bucket.
        hll_blob.merge!(hll_blob_for_batch(batch_start, batch_start + batch_size)) { |_key, old, new| [old, new].max }
        batch_start += batch_size
      rescue ActiveRecord::QueryCanceled
        # retry with a safe batch size & warmer cache
        if batch_size >= 2 * MIN_REQUIRED_BATCH_SIZE
          batch_size /= 2
        else
          return FALLBACK
        end
      end

      sleep(SLEEP_TIME_IN_SECONDS)
    end

    estimate_cardinality(hll_blob)
  end

  private

  # Converts the merged bucket hash into a cardinality estimate.
  # Buckets missing from +hll_blob+ count as empty.
  #
  # @return [Integer, Float] Float when the small-range formula is applied,
  #   truncated Integer otherwise
  def estimate_cardinality(hll_blob)
    num_zero_buckets = TOTAL_BUCKETS_NUMBER - hll_blob.size
    bucket_weights = hll_blob.values.sum { |bucket_hash| 2**(-1 * bucket_hash) }

    num_uniques = (
      ((TOTAL_BUCKETS_NUMBER**2) * BIAS_CORRECTION) /
      (num_zero_buckets + bucket_weights)
    ).to_i

    if num_zero_buckets > 0 && num_uniques < SMALL_RANGE_LIMIT
      BIAS_CORRECTION * (TOTAL_BUCKETS_NUMBER * Math.log2(TOTAL_BUCKETS_NUMBER.to_f / num_zero_buckets))
    else
      num_uniques
    end
  end

  # Executes BUCKETED_DATA_SQL for the given primary-key range and turns the
  # rows into a hash built from each row's values.
  def hll_blob_for_batch(start, finish)
    @relation
      .connection
      .execute(BUCKETED_DATA_SQL % { source_query: source_query(start, finish) })
      .map(&:values)
      .to_h
  end

  # SELECT CAST(('X' || md5(CAST(%{column} as text))) as bit(32)) attr_hash_32_bits
  # FROM %{relation}
  # WHERE %{pkey} >= %{batch_start} AND %{pkey} < %{batch_end}
  # AND %{column} IS NOT NULL
  def source_query(start, finish)
    col_as_arel = @column.is_a?(Arel::Attributes::Attribute) ? @column : Arel.sql(@column.to_s)
    col_as_text = Arel::Nodes::NamedFunction.new('CAST', [col_as_arel.as('text')])
    md5_of_col = Arel::Nodes::NamedFunction.new('md5', [col_as_text])
    md5_as_hex = Arel::Nodes::Concat.new(Arel.sql("'X'"), md5_of_col)
    bits = Arel::Nodes::NamedFunction.new('CAST', [md5_as_hex.as('bit(32)')])

    @relation
      .where(@relation.primary_key => (start...finish))
      .where(col_as_arel.not_eq(nil))
      .select(bits.as('attr_hash_32_bits')).to_sql
  end

  # Explicit start, else the smallest primary key of the relation (ignoring
  # GROUP BY / HAVING), else 0.
  def actual_start(start)
    start || @relation.unscope(:group, :having).minimum(@relation.primary_key) || 0
  end

  # Explicit finish, else the largest primary key of the relation (ignoring
  # GROUP BY / HAVING), else 0.
  def actual_finish(finish)
    finish || @relation.unscope(:group, :having).maximum(@relation.primary_key) || 0
  end
end
end
end
spec/lib/gitlab/database/postgres_hll_batch_distinct_count_spec.rb
0 → 100644
View file @
636fb4c9
# frozen_string_literal: true
require
'spec_helper'
RSpec.describe Gitlab::Database::PostgresHllBatchDistinctCount do
  let_it_be(:error_rate) { 4.9 } # HyperLogLog is a probabilistic algorithm, which provides estimated data, with given error margin
  # Constants are taken from the counter class under test rather than from an unrelated counter.
  let_it_be(:fallback) { ::Gitlab::Database::PostgresHllBatchDistinctCounter::FALLBACK }
  let_it_be(:small_batch_size) { calculate_batch_size(::Gitlab::Database::PostgresHllBatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE) }
  let(:model) { Issue }
  let(:column) { :author_id }

  let(:in_transaction) { false }

  let_it_be(:user) { create(:user, email: 'email1@domain.com') }
  let_it_be(:another_user) { create(:user, email: 'email2@domain.com') }

  # Returns batch_size - 1, used to build inclusive range ends in expectations.
  def calculate_batch_size(batch_size)
    zero_offset_modifier = -1

    batch_size + zero_offset_modifier
  end

  before do
    allow(ActiveRecord::Base.connection).to receive(:transaction_open?).and_return(in_transaction)
  end

  context 'different distribution of relation records' do
    [10, 100, 100_000].each do |spread|
      context "records are spread within #{spread}" do
        before do
          ids = (1..spread).to_a.sample(10)
          # NOTE(review): the id is only assigned in memory and never saved here,
          # so the sampled ids do not appear to reach the database - confirm intent.
          create_list(:issue, 10).each_with_index do |issue, i|
            issue.id = ids[i]
          end
        end

        it 'counts table' do
          expect(described_class.batch_distinct_count(model)).to be_within(error_rate).percent_of(10)
        end
      end
    end
  end

  context 'unit test for different counting parameters' do
    before_all do
      create_list(:issue, 3, author: user)
      create_list(:issue, 2, author: another_user)
    end

    # Expects the including context to define `args` and `default_batch_size`.
    shared_examples 'disallowed configurations' do |method|
      it 'returns fallback if start is bigger than finish' do
        expect(described_class.public_send(method, *args, start: 1, finish: 0)).to eq(fallback)
      end

      it 'returns fallback if loops more than allowed' do
        large_finish = Gitlab::Database::PostgresHllBatchDistinctCounter::MAX_ALLOWED_LOOPS * default_batch_size + 1
        expect(described_class.public_send(method, *args, start: 1, finish: large_finish)).to eq(fallback)
      end

      it 'returns fallback if batch size is less than min required' do
        expect(described_class.public_send(method, *args, batch_size: small_batch_size)).to eq(fallback)
      end
    end

    # Expects the including context to define `subject`.
    shared_examples 'when a transaction is open' do
      let(:in_transaction) { true }

      it 'raises an error' do
        expect { subject }.to raise_error('BatchCount can not be run inside a transaction')
      end
    end

    # Expects the including context to define `subject` as a callable and `column`.
    shared_examples 'when batch fetch query is canceled' do
      let(:batch_size) { 22_000 }

      it 'reduces batch size by half and retry fetch' do
        allow(model).to receive(:where).with("id" => 0..calculate_batch_size(batch_size)).and_raise(ActiveRecord::QueryCanceled)
        expect(model).to receive(:where).with("id" => 0..calculate_batch_size(batch_size / 2)).and_call_original

        subject.call(model, column, batch_size: batch_size, start: 0)
      end
    end

    describe '#batch_distinct_count' do
      it 'counts table' do
        expect(described_class.batch_distinct_count(model)).to be_within(error_rate).percent_of(5)
      end

      it 'counts with column field' do
        expect(described_class.batch_distinct_count(model, column)).to be_within(error_rate).percent_of(2)
      end

      it 'counts with :id field' do
        expect(described_class.batch_distinct_count(model, :id)).to be_within(error_rate).percent_of(5)
      end

      it 'counts with "id" field' do
        expect(described_class.batch_distinct_count(model, "id")).to be_within(error_rate).percent_of(5)
      end

      it 'counts with table.column field' do
        expect(described_class.batch_distinct_count(model, "#{model.table_name}.#{column}")).to be_within(error_rate).percent_of(2)
      end

      it 'counts with Arel column' do
        expect(described_class.batch_distinct_count(model, model.arel_table[column])).to be_within(error_rate).percent_of(2)
      end

      it 'counts over joined relations' do
        expect(described_class.batch_distinct_count(model.joins(:author), "users.email")).to be_within(error_rate).percent_of(2)
      end

      it 'counts with :column field with batch_size of 50K' do
        expect(described_class.batch_distinct_count(model, column, batch_size: 50_000)).to be_within(error_rate).percent_of(2)
      end

      it 'will not count table with a batch size less than allowed' do
        expect(described_class.batch_distinct_count(model, column, batch_size: small_batch_size)).to eq(fallback)
      end

      it 'counts with different number of batches and aggregates total result' do
        # Lift the minimum so that tiny batch sizes are accepted.
        stub_const('Gitlab::Database::PostgresHllBatchDistinctCounter::MIN_REQUIRED_BATCH_SIZE', 0)

        [1, 2, 4, 5, 6].each { |i| expect(described_class.batch_distinct_count(model, batch_size: i)).to be_within(error_rate).percent_of(5) }
      end

      it 'counts with a start and finish' do
        expect(described_class.batch_distinct_count(model, column, start: model.minimum(:id), finish: model.maximum(:id))).to be_within(error_rate).percent_of(2)
      end

      it "defaults the batch size to #{Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE}" do
        min_id = model.minimum(:id)
        batch_end_id = min_id + calculate_batch_size(Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE)

        expect(model).to receive(:where).with("id" => min_id..batch_end_id).and_call_original

        described_class.batch_distinct_count(model)
      end

      it_behaves_like 'when a transaction is open' do
        subject { described_class.batch_distinct_count(model, column) }
      end

      context 'disallowed configurations' do
        include_examples 'disallowed configurations', :batch_distinct_count do
          let(:args) { [model, column] }
          let(:default_batch_size) { Gitlab::Database::PostgresHllBatchDistinctCounter::DEFAULT_BATCH_SIZE }
        end
      end

      it_behaves_like 'when batch fetch query is canceled' do
        let(:column) { nil }

        subject { described_class.method(:batch_distinct_count) }
      end
    end
  end
end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment