Commit af4b0bd1 authored by Simon Schrottner's avatar Simon Schrottner

Change BitBucket Pull Request Import to a Batch Process

As BitBucket importer tried to fetch all Pull Requests upfront, before
importing it, the memory consumption got very high. So high, that bigger
projects could never be imported fully, as they had too many Pull
Requests.

This change should reduce the amount of occupied memory, as the Pull
Requests are now fetched and imported in a batch of 100. As it used to
be done before hand, for creating pull request branches.

relates to https://gitlab.com/gitlab-org/gitlab/-/issues/23586
parent 369065d7
---
title: Add Batch Support for Importing Pull Requests from Bitbucket
merge_request: 46696
author: Simon Schrottner
type: performance
...@@ -8,9 +8,9 @@ module BitbucketServer ...@@ -8,9 +8,9 @@ module BitbucketServer
@connection = Connection.new(options) @connection = Connection.new(options)
end end
def pull_requests(project_key, repo) def pull_requests(project_key, repo, page_offset: 0, limit: nil)
path = "/projects/#{project_key}/repos/#{repo}/pull-requests?state=ALL" path = "/projects/#{project_key}/repos/#{repo}/pull-requests?state=ALL"
get_collection(path, :pull_request) get_collection(path, :pull_request, page_offset: page_offset, limit: limit)
end end
def activities(project_key, repo, pull_request_id) def activities(project_key, repo, pull_request_id)
......
...@@ -177,16 +177,24 @@ module Gitlab ...@@ -177,16 +177,24 @@ module Gitlab
# on the remote server. Then we have to issue a `git fetch` to download these # on the remote server. Then we have to issue a `git fetch` to download these
# branches. # branches.
def import_pull_requests def import_pull_requests
log_info(stage: 'import_pull_requests', message: 'starting') page = 0
pull_requests = client.pull_requests(project_key, repository_slug).to_a
# Creating branches on the server and fetching the newly-created branches log_info(stage: 'import_pull_requests', message: "starting")
# may take a number of network round-trips. Do this in batches so that we can
# avoid doing a git fetch for every new branch.
pull_requests.each_slice(BATCH_SIZE) do |batch|
restore_branches(batch) if recover_missing_commits
batch.each do |pull_request| loop do
log_debug(stage: 'import_pull_requests', message: "importing page #{page} and batch-size #{BATCH_SIZE} from #{page * BATCH_SIZE} to #{(page + 1) * BATCH_SIZE}")
pull_requests = client.pull_requests(project_key, repository_slug, page_offset: page, limit: BATCH_SIZE).to_a
break if pull_requests.empty?
# Creating branches on the server and fetching the newly-created branches
# may take a number of network round-trips. This used to be done in batches to
# avoid doing a git fetch for every new branch, as the whole process is now
# batched, we do not need to separately do this in batches.
restore_branches(pull_requests) if recover_missing_commits
pull_requests.each do |pull_request|
if already_imported?(pull_request) if already_imported?(pull_request)
log_info(stage: 'import_pull_requests', message: 'already imported', iid: pull_request.iid) log_info(stage: 'import_pull_requests', message: 'already imported', iid: pull_request.iid)
else else
...@@ -201,6 +209,9 @@ module Gitlab ...@@ -201,6 +209,9 @@ module Gitlab
backtrace = Gitlab::BacktraceCleaner.clean_backtrace(e.backtrace) backtrace = Gitlab::BacktraceCleaner.clean_backtrace(e.backtrace)
errors << { type: :pull_request, iid: pull_request.iid, errors: e.message, backtrace: backtrace.join("\n"), raw_response: pull_request.raw } errors << { type: :pull_request, iid: pull_request.iid, errors: e.message, backtrace: backtrace.join("\n"), raw_response: pull_request.raw }
end end
log_debug(stage: 'import_pull_requests', message: "finished page #{page} and batch-size #{BATCH_SIZE}")
page += 1
end end
end end
...@@ -416,6 +427,10 @@ module Gitlab ...@@ -416,6 +427,10 @@ module Gitlab
} }
end end
def log_debug(details)
logger.debug(log_base_data.merge(details))
end
def log_info(details) def log_info(details)
logger.info(log_base_data.merge(details)) logger.info(log_base_data.merge(details))
end end
......
...@@ -19,6 +19,15 @@ RSpec.describe BitbucketServer::Client do ...@@ -19,6 +19,15 @@ RSpec.describe BitbucketServer::Client do
subject.pull_requests(project, repo_slug) subject.pull_requests(project, repo_slug)
end end
it 'requests a collection with offset and limit' do
offset = 10
limit = 100
expect(BitbucketServer::Paginator).to receive(:new).with(anything, path, :pull_request, page_offset: offset, limit: limit)
subject.pull_requests(project, repo_slug, page_offset: offset, limit: limit)
end
end end
describe '#activities' do describe '#activities' do
......
...@@ -112,7 +112,7 @@ RSpec.describe Gitlab::BitbucketServerImport::Importer do ...@@ -112,7 +112,7 @@ RSpec.describe Gitlab::BitbucketServerImport::Importer do
allow(subject).to receive(:delete_temp_branches) allow(subject).to receive(:delete_temp_branches)
allow(subject).to receive(:restore_branches) allow(subject).to receive(:restore_branches)
allow(subject.client).to receive(:pull_requests).and_return([pull_request]) allow(subject.client).to receive(:pull_requests).and_return([pull_request], [])
end end
# As we are using Caching with redis, it is best to clean the cache after each test run, else we need to wait for # As we are using Caching with redis, it is best to clean the cache after each test run, else we need to wait for
...@@ -499,7 +499,7 @@ RSpec.describe Gitlab::BitbucketServerImport::Importer do ...@@ -499,7 +499,7 @@ RSpec.describe Gitlab::BitbucketServerImport::Importer do
before do before do
Gitlab::Cache::Import::Caching.set_add(subject.already_imported_cache_key, pull_request_already_imported.iid) Gitlab::Cache::Import::Caching.set_add(subject.already_imported_cache_key, pull_request_already_imported.iid)
allow(subject.client).to receive(:pull_requests).and_return([pull_request_to_be_imported, pull_request_already_imported]) allow(subject.client).to receive(:pull_requests).and_return([pull_request_to_be_imported, pull_request_already_imported], [])
end end
it 'only imports one Merge Request, as the other on is in the cache' do it 'only imports one Merge Request, as the other on is in the cache' do
...@@ -535,7 +535,7 @@ RSpec.describe Gitlab::BitbucketServerImport::Importer do ...@@ -535,7 +535,7 @@ RSpec.describe Gitlab::BitbucketServerImport::Importer do
updated_at: Time.now, updated_at: Time.now,
merged?: true) merged?: true)
expect(subject.client).to receive(:pull_requests).and_return([pull_request]) expect(subject.client).to receive(:pull_requests).and_return([pull_request], [])
expect(subject.client).to receive(:activities).and_return([]) expect(subject.client).to receive(:activities).and_return([])
expect(subject).to receive(:import_repository).twice expect(subject).to receive(:import_repository).twice
end end
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment