Commit 1f52a43b authored by Steve Abrams's avatar Steve Abrams

Merge branch 'yorick/fix-missing-commits-data' into 'master'

Fix merge request commits that are missing committer/author details

See merge request gitlab-org/gitlab!73307
parents 3f3ac780 9b553e50
# frozen_string_literal: true
class ScheduleFixMergeRequestDiffCommitUsersMigration < Gitlab::Database::Migration[1.0]
disable_ddl_transaction!
MIGRATION_CLASS = 'FixMergeRequestDiffCommitUsers'
class Project < ApplicationRecord
include EachBatch
self.table_name = 'projects'
end
def up
# This is the day on which we merged
# https://gitlab.com/gitlab-org/gitlab/-/merge_requests/63669. Since the
# deploy of this MR we may have imported projects using the old format, but
# after their merge_request_diff_id range had been migrated by Sidekiq. As a
# result, there may be rows without a committer_id or commit_author_id
# field.
date = '2021-07-07 00:00:00'
transaction do
Project.each_batch(of: 10_000) do |batch|
time = Time.now.utc
rows = batch
.where('created_at >= ?', date)
.where(import_type: 'gitlab_project')
.pluck(:id)
.map do |id|
Gitlab::Database::BackgroundMigrationJob.new(
class_name: MIGRATION_CLASS,
arguments: [id],
created_at: time,
updated_at: time
)
end
Gitlab::Database::BackgroundMigrationJob
.bulk_insert!(rows, validate: false)
end
end
job = Gitlab::Database::BackgroundMigrationJob
.for_migration_class(MIGRATION_CLASS)
.pending
.first
migrate_in(2.minutes, MIGRATION_CLASS, job.arguments) if job
end
def down
# no-op
end
end
385c540b1f80c31a5ba009ae3507d2b855a737389bcb75c07a8872f26f8a9b44
\ No newline at end of file
# frozen_string_literal: true
module Gitlab
module BackgroundMigration
# Background migration for fixing merge_request_diff_commit rows that don't
# have committer/author details due to
# https://gitlab.com/gitlab-org/gitlab/-/issues/344080.
#
# This migration acts on a single project and corrects its data. Because
# this process needs Git/Gitaly access, and duplicating all that code is far
# too much, this migration relies on global models such as Project,
# MergeRequest, etc.
class FixMergeRequestDiffCommitUsers
def initialize
@commits = {}
@users = {}
end
def perform(project_id)
if (project = ::Project.find_by_id(project_id))
process(project)
end
::Gitlab::Database::BackgroundMigrationJob.mark_all_as_succeeded(
'FixMergeRequestDiffCommitUsers',
[project_id]
)
schedule_next_job
end
def process(project)
# Loading everything using one big query may result in timeouts (e.g.
# for projects the size of gitlab-org/gitlab). So instead we query
# data on a per merge request basis.
project.merge_requests.each_batch do |mrs|
::MergeRequestDiffCommit
.select([
:merge_request_diff_id,
:relative_order,
:sha,
:committer_id,
:commit_author_id
])
.joins(merge_request_diff: :merge_request)
.where(merge_requests: { id: mrs.select(:id) })
.where('commit_author_id IS NULL OR committer_id IS NULL')
.each do |commit|
update_commit(project, commit)
end
end
end
# rubocop: disable Metrics/AbcSize
def update_commit(project, row)
commit = find_commit(project, row.sha)
updates = []
unless row.commit_author_id
author_id = find_or_create_user(commit, :author_name, :author_email)
updates << [arel_table[:commit_author_id], author_id] if author_id
end
unless row.committer_id
committer_id =
find_or_create_user(commit, :committer_name, :committer_email)
updates << [arel_table[:committer_id], committer_id] if committer_id
end
return if updates.empty?
update = Arel::UpdateManager
.new
.table(MergeRequestDiffCommit.arel_table)
.where(matches_row(row))
.set(updates)
.to_sql
MergeRequestDiffCommit.connection.execute(update)
end
# rubocop: enable Metrics/AbcSize
def schedule_next_job
job = Database::BackgroundMigrationJob
.for_migration_class('FixMergeRequestDiffCommitUsers')
.pending
.first
return unless job
BackgroundMigrationWorker.perform_in(
2.minutes,
'FixMergeRequestDiffCommitUsers',
job.arguments
)
end
def find_commit(project, sha)
@commits[sha] ||= (project.commit(sha)&.to_hash || {})
end
def find_or_create_user(commit, name_field, email_field)
name = commit[name_field]
email = commit[email_field]
return unless name && email
@users[[name, email]] ||=
MergeRequest::DiffCommitUser.find_or_create(name, email).id
end
def matches_row(row)
primary_key = Arel::Nodes::Grouping
.new([arel_table[:merge_request_diff_id], arel_table[:relative_order]])
primary_val = Arel::Nodes::Grouping
.new([row.merge_request_diff_id, row.relative_order])
primary_key.eq(primary_val)
end
def arel_table
MergeRequestDiffCommit.arel_table
end
end
end
end
...@@ -4,6 +4,7 @@ module Gitlab ...@@ -4,6 +4,7 @@ module Gitlab
module Database module Database
class BackgroundMigrationJob < ActiveRecord::Base # rubocop:disable Rails/ApplicationRecord class BackgroundMigrationJob < ActiveRecord::Base # rubocop:disable Rails/ApplicationRecord
include EachBatch include EachBatch
include BulkInsertSafe
self.table_name = :background_migration_jobs self.table_name = :background_migration_jobs
......
...@@ -47,15 +47,15 @@ module Gitlab ...@@ -47,15 +47,15 @@ module Gitlab
attributes attributes
end end
private def find_with_cache(key = cache_key)
return yield unless lru_cache && key
attr_reader :klass, :attributes, :lru_cache, :cache_key lru_cache[key] ||= yield
end
def find_with_cache private
return yield unless lru_cache && cache_key
lru_cache[cache_key] ||= yield attr_reader :klass, :attributes, :lru_cache, :cache_key
end
def cache_from_request_store def cache_from_request_store
Gitlab::SafeRequestStore[:lru_cache] ||= LruRedux::Cache.new(LRU_CACHE_SIZE) Gitlab::SafeRequestStore[:lru_cache] ||= LruRedux::Cache.new(LRU_CACHE_SIZE)
......
...@@ -401,10 +401,6 @@ excluded_attributes: ...@@ -401,10 +401,6 @@ excluded_attributes:
- :verification_checksum - :verification_checksum
- :verification_failure - :verification_failure
merge_request_diff_commits: merge_request_diff_commits:
- :author_name
- :author_email
- :committer_name
- :committer_email
- :merge_request_diff_id - :merge_request_diff_id
- :commit_author_id - :commit_author_id
- :committer_id - :committer_id
......
...@@ -29,6 +29,7 @@ module Gitlab ...@@ -29,6 +29,7 @@ module Gitlab
def find def find
return if epic? && group.nil? return if epic? && group.nil?
return find_diff_commit_user if diff_commit_user? return find_diff_commit_user if diff_commit_user?
return find_diff_commit if diff_commit?
super super
end end
...@@ -83,9 +84,38 @@ module Gitlab ...@@ -83,9 +84,38 @@ module Gitlab
end end
def find_diff_commit_user def find_diff_commit_user
find_with_cache do find_or_create_diff_commit_user(@attributes['name'], @attributes['email'])
MergeRequest::DiffCommitUser end
.find_or_create(@attributes['name'], @attributes['email'])
def find_diff_commit
row = @attributes.dup
# Diff commits come in two formats:
#
# 1. The old format where author/committer details are separate fields
# 2. The new format where author/committer details are nested objects,
# and pre-processed by `find_diff_commit_user`.
#
# The code here ensures we support both the old and new format.
aname = row.delete('author_name')
amail = row.delete('author_email')
cname = row.delete('committer_name')
cmail = row.delete('committer_email')
author = row.delete('commit_author')
committer = row.delete('committer')
row['commit_author'] = author ||
find_or_create_diff_commit_user(aname, amail)
row['committer'] = committer ||
find_or_create_diff_commit_user(cname, cmail)
MergeRequestDiffCommit.new(row)
end
def find_or_create_diff_commit_user(name, email)
find_with_cache([MergeRequest::DiffCommitUser, name, email]) do
MergeRequest::DiffCommitUser.find_or_create(name, email)
end end
end end
...@@ -113,6 +143,10 @@ module Gitlab ...@@ -113,6 +143,10 @@ module Gitlab
klass == MergeRequest::DiffCommitUser klass == MergeRequest::DiffCommitUser
end end
def diff_commit?
klass == MergeRequestDiffCommit
end
# If an existing group milestone used the IID # If an existing group milestone used the IID
# claim the IID back and set the group milestone to use one available # claim the IID back and set the group milestone to use one available
# This is necessary to fix situations like the following: # This is necessary to fix situations like the following:
......
...@@ -33,7 +33,8 @@ module Gitlab ...@@ -33,7 +33,8 @@ module Gitlab
links: 'Releases::Link', links: 'Releases::Link',
metrics_setting: 'ProjectMetricsSetting', metrics_setting: 'ProjectMetricsSetting',
commit_author: 'MergeRequest::DiffCommitUser', commit_author: 'MergeRequest::DiffCommitUser',
committer: 'MergeRequest::DiffCommitUser' }.freeze committer: 'MergeRequest::DiffCommitUser',
merge_request_diff_commits: 'MergeRequestDiffCommit' }.freeze
BUILD_MODELS = %i[Ci::Build commit_status].freeze BUILD_MODELS = %i[Ci::Build commit_status].freeze
...@@ -59,6 +60,7 @@ module Gitlab ...@@ -59,6 +60,7 @@ module Gitlab
external_pull_requests external_pull_requests
DesignManagement::Design DesignManagement::Design
MergeRequest::DiffCommitUser MergeRequest::DiffCommitUser
MergeRequestDiffCommit
].freeze ].freeze
def create def create
......
# frozen_string_literal: true
require 'spec_helper'
# The underlying migration relies on the global models (e.g. Project). This
# means we also need to use FactoryBot factories to ensure everything is
# operating using the same types. If we use `table()` and similar methods we
# would have to duplicate a lot of logic just for these tests.
#
# rubocop: disable RSpec/FactoriesInMigrationSpecs
RSpec.describe Gitlab::BackgroundMigration::FixMergeRequestDiffCommitUsers do
let(:migration) { described_class.new }
describe '#perform' do
context 'when the project exists' do
it 'processes the project' do
project = create(:project)
expect(migration).to receive(:process).with(project)
expect(migration).to receive(:schedule_next_job)
migration.perform(project.id)
end
it 'marks the background job as finished' do
project = create(:project)
Gitlab::Database::BackgroundMigrationJob.create!(
class_name: 'FixMergeRequestDiffCommitUsers',
arguments: [project.id]
)
migration.perform(project.id)
job = Gitlab::Database::BackgroundMigrationJob
.find_by(class_name: 'FixMergeRequestDiffCommitUsers')
expect(job.status).to eq('succeeded')
end
end
context 'when the project does not exist' do
it 'does nothing' do
expect(migration).not_to receive(:process)
expect(migration).to receive(:schedule_next_job)
migration.perform(-1)
end
end
end
describe '#update_commit' do
let(:project) { create(:project, :repository) }
let(:mr) do
create(
:merge_request_with_diffs,
source_project: project,
target_project: project
)
end
let(:diff) { mr.merge_request_diffs.first }
let(:commit) { project.commit }
def update_row(migration, project, diff, row)
migration.update_commit(project, row)
diff
.merge_request_diff_commits
.find_by(sha: row.sha, relative_order: row.relative_order)
end
it 'populates missing commit authors' do
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000
)
updated = update_row(migration, project, diff, commit_row)
expect(updated.commit_author.name).to eq(commit.to_hash[:author_name])
expect(updated.commit_author.email).to eq(commit.to_hash[:author_email])
end
it 'populates missing committers' do
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000
)
updated = update_row(migration, project, diff, commit_row)
expect(updated.committer.name).to eq(commit.to_hash[:committer_name])
expect(updated.committer.email).to eq(commit.to_hash[:committer_email])
end
it 'leaves existing commit authors as-is' do
user = create(:merge_request_diff_commit_user)
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000,
commit_author: user
)
updated = update_row(migration, project, diff, commit_row)
expect(updated.commit_author).to eq(user)
end
it 'leaves existing committers as-is' do
user = create(:merge_request_diff_commit_user)
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000,
committer: user
)
updated = update_row(migration, project, diff, commit_row)
expect(updated.committer).to eq(user)
end
it 'does nothing when both the author and committer are present' do
user = create(:merge_request_diff_commit_user)
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000,
committer: user,
commit_author: user
)
recorder = ActiveRecord::QueryRecorder.new do
migration.update_commit(project, commit_row)
end
expect(recorder.count).to be_zero
end
it 'does nothing if the commit does not exist in Git' do
user = create(:merge_request_diff_commit_user)
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: 'kittens',
relative_order: 9000,
committer: user,
commit_author: user
)
recorder = ActiveRecord::QueryRecorder.new do
migration.update_commit(project, commit_row)
end
expect(recorder.count).to be_zero
end
it 'does nothing when the committer/author are missing in the Git commit' do
user = create(:merge_request_diff_commit_user)
commit_row = create(
:merge_request_diff_commit,
merge_request_diff: diff,
sha: commit.sha,
relative_order: 9000,
committer: user,
commit_author: user
)
allow(migration).to receive(:find_or_create_user).and_return(nil)
recorder = ActiveRecord::QueryRecorder.new do
migration.update_commit(project, commit_row)
end
expect(recorder.count).to be_zero
end
end
describe '#schedule_next_job' do
it 'schedules the next background migration' do
Gitlab::Database::BackgroundMigrationJob
.create!(class_name: 'FixMergeRequestDiffCommitUsers', arguments: [42])
expect(BackgroundMigrationWorker)
.to receive(:perform_in)
.with(2.minutes, 'FixMergeRequestDiffCommitUsers', [42])
migration.schedule_next_job
end
it 'does nothing when there are no jobs' do
expect(BackgroundMigrationWorker)
.not_to receive(:perform_in)
migration.schedule_next_job
end
end
describe '#find_commit' do
let(:project) { create(:project, :repository) }
it 'finds a commit using Git' do
commit = project.commit
found = migration.find_commit(project, commit.sha)
expect(found).to eq(commit.to_hash)
end
it 'caches the results' do
commit = project.commit
migration.find_commit(project, commit.sha)
expect { migration.find_commit(project, commit.sha) }
.not_to change { Gitlab::GitalyClient.get_request_count }
end
it 'returns an empty hash if the commit does not exist' do
expect(migration.find_commit(project, 'kittens')).to eq({})
end
end
describe '#find_or_create_user' do
let(:project) { create(:project, :repository) }
it 'creates missing users' do
commit = project.commit.to_hash
id = migration.find_or_create_user(commit, :author_name, :author_email)
expect(MergeRequest::DiffCommitUser.count).to eq(1)
created = MergeRequest::DiffCommitUser.first
expect(created.name).to eq(commit[:author_name])
expect(created.email).to eq(commit[:author_email])
expect(created.id).to eq(id)
end
it 'returns users that already exist' do
commit = project.commit.to_hash
user1 = migration.find_or_create_user(commit, :author_name, :author_email)
user2 = migration.find_or_create_user(commit, :author_name, :author_email)
expect(user1).to eq(user2)
end
it 'caches the results' do
commit = project.commit.to_hash
migration.find_or_create_user(commit, :author_name, :author_email)
recorder = ActiveRecord::QueryRecorder.new do
migration.find_or_create_user(commit, :author_name, :author_email)
end
expect(recorder.count).to be_zero
end
it 'returns nil if the commit details are missing' do
id = migration.find_or_create_user({}, :author_name, :author_email)
expect(id).to be_nil
end
end
describe '#matches_row' do
it 'returns the query matches for the composite primary key' do
row = double(:commit, merge_request_diff_id: 4, relative_order: 5)
arel = migration.matches_row(row)
expect(arel.to_sql).to eq(
'("merge_request_diff_commits"."merge_request_diff_id", "merge_request_diff_commits"."relative_order") = (4, 5)'
)
end
end
end
# rubocop: enable RSpec/FactoriesInMigrationSpecs
...@@ -176,4 +176,118 @@ RSpec.describe Gitlab::ImportExport::Project::ObjectBuilder do ...@@ -176,4 +176,118 @@ RSpec.describe Gitlab::ImportExport::Project::ObjectBuilder do
expect(found.email).to eq('alice@example.com') expect(found.email).to eq('alice@example.com')
end end
end end
context 'merge request diff commits' do
context 'when the "committer" object is present' do
it 'uses this object as the committer' do
user = MergeRequest::DiffCommitUser
.find_or_create('Alice', 'alice@example.com')
commit = described_class.build(
MergeRequestDiffCommit,
{
'committer' => user,
'committer_name' => 'Bla',
'committer_email' => 'bla@example.com',
'author_name' => 'Bla',
'author_email' => 'bla@example.com'
}
)
expect(commit.committer).to eq(user)
end
end
context 'when the "committer" object is missing' do
it 'creates one from the committer name and Email' do
commit = described_class.build(
MergeRequestDiffCommit,
{
'committer_name' => 'Alice',
'committer_email' => 'alice@example.com',
'author_name' => 'Alice',
'author_email' => 'alice@example.com'
}
)
expect(commit.committer.name).to eq('Alice')
expect(commit.committer.email).to eq('alice@example.com')
end
end
context 'when the "commit_author" object is present' do
it 'uses this object as the author' do
user = MergeRequest::DiffCommitUser
.find_or_create('Alice', 'alice@example.com')
commit = described_class.build(
MergeRequestDiffCommit,
{
'committer_name' => 'Alice',
'committer_email' => 'alice@example.com',
'commit_author' => user,
'author_name' => 'Bla',
'author_email' => 'bla@example.com'
}
)
expect(commit.commit_author).to eq(user)
end
end
context 'when the "commit_author" object is missing' do
it 'creates one from the author name and Email' do
commit = described_class.build(
MergeRequestDiffCommit,
{
'committer_name' => 'Alice',
'committer_email' => 'alice@example.com',
'author_name' => 'Alice',
'author_email' => 'alice@example.com'
}
)
expect(commit.commit_author.name).to eq('Alice')
expect(commit.commit_author.email).to eq('alice@example.com')
end
end
end
describe '#find_or_create_diff_commit_user' do
context 'when the user already exists' do
it 'returns the existing user' do
user = MergeRequest::DiffCommitUser
.find_or_create('Alice', 'alice@example.com')
found = described_class
.new(MergeRequestDiffCommit, {})
.send(:find_or_create_diff_commit_user, user.name, user.email)
expect(found).to eq(user)
end
end
context 'when the user does not exist' do
it 'creates the user' do
found = described_class
.new(MergeRequestDiffCommit, {})
.send(:find_or_create_diff_commit_user, 'Alice', 'alice@example.com')
expect(found.name).to eq('Alice')
expect(found.email).to eq('alice@example.com')
end
end
it 'caches the results' do
builder = described_class.new(MergeRequestDiffCommit, {})
builder.send(:find_or_create_diff_commit_user, 'Alice', 'alice@example.com')
record = ActiveRecord::QueryRecorder.new do
builder.send(:find_or_create_diff_commit_user, 'Alice', 'alice@example.com')
end
expect(record.count).to eq(1)
end
end
end end
# frozen_string_literal: true
require 'spec_helper'
require_migration! 'schedule_fix_merge_request_diff_commit_users_migration'
RSpec.describe ScheduleFixMergeRequestDiffCommitUsersMigration, :migration do
let(:migration) { described_class.new }
let(:namespaces) { table(:namespaces) }
let(:projects) { table(:projects) }
let(:namespace) { namespaces.create!(name: 'foo', path: 'foo') }
describe '#up' do
it 'does nothing when there are no projects to correct' do
migration.up
expect(Gitlab::Database::BackgroundMigrationJob.count).to be_zero
end
it 'schedules imported projects created after July' do
project = projects.create!(
namespace_id: namespace.id,
import_type: 'gitlab_project',
created_at: '2021-08-01'
)
expect(migration)
.to receive(:migrate_in)
.with(2.minutes, 'FixMergeRequestDiffCommitUsers', [project.id])
migration.up
expect(Gitlab::Database::BackgroundMigrationJob.count).to eq(1)
job = Gitlab::Database::BackgroundMigrationJob.first
expect(job.class_name).to eq('FixMergeRequestDiffCommitUsers')
expect(job.arguments).to eq([project.id])
end
it 'ignores projects imported before July' do
projects.create!(
namespace_id: namespace.id,
import_type: 'gitlab_project',
created_at: '2020-08-01'
)
migration.up
expect(Gitlab::Database::BackgroundMigrationJob.count).to be_zero
end
it 'ignores projects that are not imported' do
projects.create!(
namespace_id: namespace.id,
created_at: '2021-08-01'
)
migration.up
expect(Gitlab::Database::BackgroundMigrationJob.count).to be_zero
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment