Commit b4a2ba41 authored by Kamil Trzciński's avatar Kamil Trzciński

Merge branch '27070-dedup-import-json' into 'master'

De-dup project tree entries

Closes #27070

See merge request gitlab-org/gitlab!22598
parents f028d34b 499be639
# frozen_string_literal: true
module Gitlab
module ImportExport
class ProjectTreeLoader
def load(path, dedup_entries: false)
tree_hash = ActiveSupport::JSON.decode(IO.read(path))
if dedup_entries
dedup_tree(tree_hash)
else
tree_hash
end
end
private
# This function removes duplicate entries from the given tree recursively
# by caching nodes it encounters repeatedly. We only consider nodes for
# which there can actually be multiple equivalent instances (e.g. strings,
# hashes and arrays, but not `nil`s, numbers or booleans.)
#
# The algorithm uses a recursive depth-first descent with 3 cases, starting
# with a root node (the tree/hash itself):
# - a node has already been cached; in this case we return it from the cache
# - a node has not been cached yet but should be; descend into its children
# - a node is neither cached nor qualifies for caching; this is a no-op
def dedup_tree(node, nodes_seen = {})
if nodes_seen.key?(node) && distinguishable?(node)
yield nodes_seen[node]
elsif should_dedup?(node)
nodes_seen[node] = node
case node
when Array
node.each_index do |idx|
dedup_tree(node[idx], nodes_seen) do |cached_node|
node[idx] = cached_node
end
end
when Hash
node.each do |k, v|
dedup_tree(v, nodes_seen) do |cached_node|
node[k] = cached_node
end
end
end
else
node
end
end
# We do not need to consider nodes for which there cannot be multiple instances
def should_dedup?(node)
node && !(node.is_a?(Numeric) || node.is_a?(TrueClass) || node.is_a?(FalseClass))
end
# We can only safely de-dup values that are distinguishable. True value objects
# are always distinguishable by nature. Hashes however can represent entities,
# which are identified by ID, not value. We therefore disallow de-duping hashes
# that do not have an `id` field, since we might risk dropping entities that
# have equal attributes yet different identities.
def distinguishable?(node)
if node.is_a?(Hash)
node.key?('id')
else
true
end
end
end
end
end
......@@ -3,15 +3,17 @@
module Gitlab
module ImportExport
class ProjectTreeRestorer
LARGE_PROJECT_FILE_SIZE_BYTES = 500.megabyte
attr_reader :user
attr_reader :shared
attr_reader :project
def initialize(user:, shared:, project:)
@path = File.join(shared.export_path, 'project.json')
@user = user
@shared = shared
@project = project
@tree_loader = ProjectTreeLoader.new
end
def restore
......@@ -36,9 +38,16 @@ module Gitlab
private
def large_project?(path)
File.size(path) >= LARGE_PROJECT_FILE_SIZE_BYTES
end
def read_tree_hash
json = IO.read(@path)
ActiveSupport::JSON.decode(json)
path = File.join(@shared.export_path, 'project.json')
dedup_entries = large_project?(path) &&
Feature.enabled?(:dedup_project_import_metadata, project.group)
@tree_loader.load(path, dedup_entries: dedup_entries)
rescue => e
Rails.logger.error("Import/Export error: #{e.message}") # rubocop:disable Gitlab/RailsLogger
raise Gitlab::ImportExport::Error.new('Incorrect JSON format')
......
......@@ -159,7 +159,7 @@ module Gitlab
def build_relation(relation_key, relation_definition, data_hash)
# TODO: This is hack to not create relation for the author
# Rather make `RelationFactory#set_note_author` to take care of that
return data_hash if relation_key == 'author'
return data_hash if relation_key == 'author' || already_restored?(data_hash)
# create relation objects recursively for all sub-objects
relation_definition.each do |sub_relation_key, sub_relation_definition|
......@@ -169,6 +169,13 @@ module Gitlab
@relation_factory.create(relation_factory_params(relation_key, data_hash))
end
# Since we update the data hash in place as we restore relation items,
# and since we also de-duplicate items, we might encounter items that
# have already been restored in a previous iteration.
def already_restored?(relation_item)
!relation_item.is_a?(Hash)
end
def transform_sub_relations!(data_hash, sub_relation_key, sub_relation_definition)
sub_data_hash = data_hash[sub_relation_key]
return unless sub_data_hash
......
{
"simple": 42,
"duped_hash_with_id": {
"id": 0,
"v1": 1
},
"duped_hash_no_id": {
"v1": 1
},
"duped_array": [
"v2"
],
"array": [
{
"duped_hash_with_id": {
"id": 0,
"v1": 1
}
},
{
"duped_array": [
"v2"
]
},
{
"duped_hash_no_id": {
"v1": 1
}
}
],
"nested": {
"duped_hash_with_id": {
"id": 0,
"v1": 1
},
"duped_array": [
"v2"
],
"array": [
"don't touch"
]
}
}
\ No newline at end of file
# frozen_string_literal: true
require 'spec_helper'
describe Gitlab::ImportExport::ProjectTreeLoader do
let(:fixture) { 'spec/fixtures/lib/gitlab/import_export/with_duplicates.json' }
let(:project_tree) { JSON.parse(File.read(fixture)) }
context 'without de-duplicating entries' do
let(:parsed_tree) do
subject.load(fixture)
end
it 'parses the JSON into the expected tree' do
expect(parsed_tree).to eq(project_tree)
end
it 'does not de-duplicate entries' do
expect(parsed_tree['duped_hash_with_id']).not_to be(parsed_tree['array'][0]['duped_hash_with_id'])
end
end
context 'with de-duplicating entries' do
let(:parsed_tree) do
subject.load(fixture, dedup_entries: true)
end
it 'parses the JSON into the expected tree' do
expect(parsed_tree).to eq(project_tree)
end
it 'de-duplicates equal values' do
expect(parsed_tree['duped_hash_with_id']).to be(parsed_tree['array'][0]['duped_hash_with_id'])
expect(parsed_tree['duped_hash_with_id']).to be(parsed_tree['nested']['duped_hash_with_id'])
expect(parsed_tree['duped_array']).to be(parsed_tree['array'][1]['duped_array'])
expect(parsed_tree['duped_array']).to be(parsed_tree['nested']['duped_array'])
end
it 'does not de-duplicate hashes without IDs' do
expect(parsed_tree['duped_hash_no_id']).to eq(parsed_tree['array'][2]['duped_hash_no_id'])
expect(parsed_tree['duped_hash_no_id']).not_to be(parsed_tree['array'][2]['duped_hash_no_id'])
end
it 'keeps single entries intact' do
expect(parsed_tree['simple']).to eq(42)
expect(parsed_tree['nested']['array']).to eq(["don't touch"])
end
end
end
......@@ -450,7 +450,9 @@ describe Gitlab::ImportExport::ProjectTreeRestorer do
context 'project.json file access check' do
let(:user) { create(:user) }
let!(:project) { create(:project, :builds_disabled, :issues_disabled, name: 'project', path: 'project') }
let(:project_tree_restorer) { described_class.new(user: user, shared: shared, project: project) }
let(:project_tree_restorer) do
described_class.new(user: user, shared: shared, project: project)
end
let(:restored_project_json) { project_tree_restorer.restore }
it 'does not read a symlink' do
......@@ -725,7 +727,9 @@ describe Gitlab::ImportExport::ProjectTreeRestorer do
let(:project) { create(:project) }
let(:user) { create(:user) }
let(:tree_hash) { { 'visibility_level' => visibility } }
let(:restorer) { described_class.new(user: user, shared: shared, project: project) }
let(:restorer) do
described_class.new(user: user, shared: shared, project: project)
end
before do
expect(restorer).to receive(:read_tree_hash) { tree_hash }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment