Commit 49df1f27 authored by Robert Speicher's avatar Robert Speicher

Merge branch 'reuse-queries-in-reference-parsers' into 'master'

Re-use queries in reference parsers

This MR changes the reference parsing pipeline to cache queried objects and re-use them on subsequent runs. Data is cached in the `RequestStore` module so it's automatically flushed after a request.

Certain references are processed multiple times. For example, for every system note we check if it's a cross reference by getting the issues, MRs and commits it references. When redacting data we may end up querying these very same objects. By caching this we can quite drastically reduce timings and SQL query counts.

#15607

See merge request !5020
parents 92772f85 9ac4c556
......@@ -20,6 +20,7 @@ v 8.10.0 (unreleased)
- Add Spring EmojiOne updates.
- Fix viewing notification settings when a project is pending deletion
- Fix pagination when sorting by columns with lots of ties (like priority)
- The Markdown reference parsers now re-use query results to prevent running the same queries multiple times !5020
- Updated project header design
- Exclude email check from the standard health check
- Updated layout for Projects, Groups, Users on Admin area !4424
......
......@@ -133,8 +133,9 @@ module Banzai
return {} if nodes.empty?
ids = unique_attribute_values(nodes, attribute)
rows = collection_objects_for_ids(collection, ids)
collection.where(id: ids).each_with_object({}) do |row, hash|
rows.each_with_object({}) do |row, hash|
hash[row.id] = row
end
end
......@@ -153,6 +154,31 @@ module Banzai
values.to_a
end
# Queries the collection for the objects with the given IDs.
#
# If the RequestStore module is enabled this method will only query any
# objects that have not yet been queried. For objects that have already
# been queried the object is returned from the cache.
def collection_objects_for_ids(collection, ids)
if RequestStore.active?
cache = collection_cache[collection_cache_key(collection)]
to_query = ids.map(&:to_i) - cache.keys
unless to_query.empty?
collection.where(id: to_query).each { |row| cache[row.id] = row }
end
cache.values
else
collection.where(id: ids)
end
end
# Returns the cache key to use for a collection.
def collection_cache_key(collection)
collection.respond_to?(:model) ? collection.model : collection
end
# Processes the list of HTML documents and returns an Array containing all
# the references.
def process(documents)
......@@ -189,7 +215,7 @@ module Banzai
end
def find_projects_for_hash_keys(hash)
Project.where(id: hash.keys)
collection_objects_for_ids(Project, hash.keys)
end
private
......@@ -199,6 +225,12 @@ module Banzai
def lazy(&block)
Gitlab::Lazy.new(&block)
end
def collection_cache
RequestStore[:banzai_collection_cache] ||= Hash.new do |hash, key|
hash[key] = {}
end
end
end
end
end
......@@ -73,7 +73,7 @@ module Banzai
def find_users(ids)
return [] if ids.empty?
User.where(id: ids).to_a
collection_objects_for_ids(User, ids)
end
def find_users_for_groups(ids)
......@@ -85,7 +85,8 @@ module Banzai
def find_users_for_projects(ids)
return [] if ids.empty?
Project.where(id: ids).flat_map { |p| p.team.members.to_a }
collection_objects_for_ids(Project, ids).
flat_map { |p| p.team.members.to_a }
end
end
end
......
......@@ -234,4 +234,79 @@ describe Banzai::ReferenceParser::BaseParser, lib: true do
to eq([project])
end
end
describe '#collection_objects_for_ids' do
context 'with RequestStore disabled' do
it 'queries the collection directly' do
collection = User.all
expect(collection).to receive(:where).twice.and_call_original
2.times do
expect(subject.collection_objects_for_ids(collection, [user.id])).
to eq([user])
end
end
end
context 'with RequestStore enabled' do
before do
cache = Hash.new { |hash, key| hash[key] = {} }
allow(RequestStore).to receive(:active?).and_return(true)
allow(subject).to receive(:collection_cache).and_return(cache)
end
it 'queries the collection on the first call' do
expect(subject.collection_objects_for_ids(User, [user.id])).
to eq([user])
end
it 'does not query previously queried objects' do
collection = User.all
expect(collection).to receive(:where).once.and_call_original
2.times do
expect(subject.collection_objects_for_ids(collection, [user.id])).
to eq([user])
end
end
it 'casts String based IDs to Fixnums before querying objects' do
2.times do
expect(subject.collection_objects_for_ids(User, [user.id.to_s])).
to eq([user])
end
end
it 'queries any additional objects after the first call' do
other_user = create(:user)
expect(subject.collection_objects_for_ids(User, [user.id])).
to eq([user])
expect(subject.collection_objects_for_ids(User, [user.id, other_user.id])).
to eq([user, other_user])
end
it 'caches objects on a per collection class basis' do
expect(subject.collection_objects_for_ids(User, [user.id])).
to eq([user])
expect(subject.collection_objects_for_ids(Project, [project.id])).
to eq([project])
end
end
end
describe '#collection_cache_key' do
it 'returns the cache key for a Class' do
expect(subject.collection_cache_key(Project)).to eq(Project)
end
it 'returns the cache key for an ActiveRecord::Relation' do
expect(subject.collection_cache_key(Project.all)).to eq(Project)
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment