Commit 098bc7cf authored by charlieablett's avatar charlieablett

Add label backup table migration

Create a backup table called `labels_backup`
that will be populated with deduplicated labels
and the nature of the change so things can be
restored if needed.
parent 1b4d6d6f
---
title: Deduplicate labels with identical title and project
merge_request: 21384
author:
type: changed
# frozen_string_literal: true
class AddLabelRestoreTable < ActiveRecord::Migration[6.0]
DOWNTIME = false
def up
# copy table
execute "CREATE TABLE #{backup_labels_table_name} (LIKE #{labels_table_name} INCLUDING ALL);"
# create foreign keys
connection.foreign_keys(labels_table_name).each do |fk|
fk_options = fk.options
execute "ALTER TABLE #{backup_labels_table_name} ADD CONSTRAINT #{fk.name} FOREIGN KEY (#{fk_options[:column]}) REFERENCES #{fk.to_table}(#{fk_options[:primary_key]});"
end
# make the primary key a real functioning one rather than incremental
execute "ALTER TABLE #{backup_labels_table_name} ALTER COLUMN ID DROP DEFAULT;"
# add some fields that make changes trackable
execute "ALTER TABLE #{backup_labels_table_name} ADD COLUMN restore_action INTEGER;"
execute "ALTER TABLE #{backup_labels_table_name} ADD COLUMN new_title VARCHAR;"
end
def down
drop_table backup_labels_table_name
end
private
def labels_table_name
:labels
end
def backup_labels_table_name
:backup_labels
end
end
# frozen_string_literal: true
class RemoveDuplicateLabelsFromProject < ActiveRecord::Migration[6.0]
DOWNTIME = false
CREATE = 1
RENAME = 2
class BackupLabel < Label
include BulkInsertSafe
include EachBatch
self.table_name = 'backup_labels'
end
class Label < ApplicationRecord
self.table_name = 'labels'
end
class Project < ApplicationRecord
include EachBatch
self.table_name = 'projects'
end
DEDUPLICATE_BATCH_SIZE = 100_000
RESTORE_BATCH_SIZE = 100
def up
# Split to smaller chunks
# Loop rather than background job, every 100,000
# there are 45,000,000 projects in total
Project.each_batch(of: DEDUPLICATE_BATCH_SIZE) do |batch|
range = batch.pluck('MIN(id)', 'MAX(id)').first
remove_full_duplicates(*range)
rename_partial_duplicates(*range)
end
end
def down
# we could probably make this more efficient by getting them in bulk by restore action
# and then applying them all at the same time...
BackupLabel.each_batch(of: RESTORE_BATCH_SIZE) do |backup_label|
action = backup_label.restore_action
target = Label.find(backup_label.id)
next unless target
next unless action
if action == RENAME
if target.title == backup_label.new_title
say "Restoring label title '#{target.title}' to backup value '#{backup_label.title}'"
target.update_attribute(:title, backup_label.title)
end
elsif action == CREATE
restored_label = Label.new(backup_label.attributes)
say "Restoring label from deletion with backup attributes #{backup_label.attributes}"
if restored_label.valid?
restored_label.save
backup_label.destroy
end
end
end
end
def remove_full_duplicates(start_id, stop_id)
# Fields that are considered duplicate:
# project_id title template description type color
duplicate_labels = ApplicationRecord.connection.execute(<<-SQL.squish)
WITH data AS (
SELECT labels.*,
row_number() OVER (PARTITION BY labels.project_id, labels.title, labels.template, labels.description, labels.type, labels.color ORDER BY labels.id) AS row_number,
#{CREATE} AS restore_action
FROM labels
WHERE labels.project_id BETWEEN #{start_id} AND #{stop_id}
AND NOT EXISTS (SELECT * FROM board_labels WHERE board_labels.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM label_links WHERE label_links.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM label_priorities WHERE label_priorities.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM lists WHERE lists.label_id = labels.id)
AND NOT EXISTS (SELECT * FROM resource_label_events WHERE resource_label_events.label_id = labels.id)
) SELECT * FROM data WHERE row_number > 1;
SQL
if duplicate_labels.any?
# create backup records
BackupLabel.insert_all!(duplicate_labels.map { |label| label.except("row_number") })
ApplicationRecord.connection.execute(<<-SQL.squish)
DELETE FROM labels
WHERE labels.id IN (#{duplicate_labels.map { |dup| dup["id"] }.join(", ")});
SQL
end
end
def rename_partial_duplicates(start_id, stop_id)
soft_duplicates = ApplicationRecord.connection.execute(<<-SQL.squish)
WITH data AS (
SELECT
*,
title || '_' || 'duplicate' || extract(epoch from now()) AS new_title,
#{RENAME} AS restore_action,
row_number() OVER (PARTITION BY project_id, title ORDER BY id) AS row_number
FROM labels
WHERE project_id BETWEEN #{start_id} AND #{stop_id}
) SELECT * from data where row_number > 1;
SQL
if soft_duplicates.any?
# create backup records
BackupLabel.insert_all!(soft_duplicates.map { |label| label.except("row_number") })
ApplicationRecord.connection.execute(<<-SQL.squish)
UPDATE labels SET title = title || '_' || 'duplicate' || extract(epoch from now())
WHERE labels.id IN (#{soft_duplicates.map { |dup| dup["id"] }.join(", ")});
SQL
end
end
end
# frozen_string_literal: true
class AddUniquenessIndexToLabelTitleAndProject < ActiveRecord::Migration[6.0]
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
disable_ddl_transaction!
PROJECT_AND_TITLE = [:project_id, :title]
def up
remove_concurrent_index :labels, PROJECT_AND_TITLE if index_exists? :labels, PROJECT_AND_TITLE
add_concurrent_index :labels, PROJECT_AND_TITLE, where: "labels.group_id = null", unique: true
end
def down
remove_concurrent_index :labels, PROJECT_AND_TITLE if index_exists? :labels, PROJECT_AND_TITLE
add_concurrent_index :labels, PROJECT_AND_TITLE, where: "labels.group_id = null", unique: false
end
end
......@@ -9405,6 +9405,23 @@ CREATE TABLE public.aws_roles (
role_external_id character varying(64) NOT NULL
);
CREATE TABLE public.backup_labels (
id integer NOT NULL,
title character varying,
color character varying,
project_id integer,
created_at timestamp without time zone,
updated_at timestamp without time zone,
template boolean DEFAULT false,
description character varying,
description_html text,
type character varying,
group_id integer,
cached_markdown_version integer,
restore_action integer,
new_title character varying
);
CREATE TABLE public.badges (
id integer NOT NULL,
link_url character varying NOT NULL,
......@@ -17224,6 +17241,9 @@ ALTER TABLE ONLY public.award_emoji
ALTER TABLE ONLY public.aws_roles
ADD CONSTRAINT aws_roles_pkey PRIMARY KEY (user_id);
ALTER TABLE ONLY public.backup_labels
ADD CONSTRAINT backup_labels_pkey PRIMARY KEY (id);
ALTER TABLE ONLY public.badges
ADD CONSTRAINT badges_pkey PRIMARY KEY (id);
......@@ -18358,6 +18378,20 @@ CREATE UNIQUE INDEX any_approver_project_rule_type_unique_index ON public.approv
CREATE UNIQUE INDEX approval_rule_name_index_for_code_owners ON public.approval_merge_request_rules USING btree (merge_request_id, code_owner, name) WHERE ((code_owner = true) AND (section IS NULL));
CREATE UNIQUE INDEX backup_labels_group_id_project_id_title_idx ON public.backup_labels USING btree (group_id, project_id, title);
CREATE INDEX backup_labels_group_id_title_idx ON public.backup_labels USING btree (group_id, title) WHERE (project_id = NULL::integer);
CREATE INDEX backup_labels_project_id_idx ON public.backup_labels USING btree (project_id);
CREATE INDEX backup_labels_project_id_title_idx ON public.backup_labels USING btree (project_id, title) WHERE (group_id = NULL::integer);
CREATE INDEX backup_labels_template_idx ON public.backup_labels USING btree (template) WHERE template;
CREATE INDEX backup_labels_title_idx ON public.backup_labels USING btree (title);
CREATE INDEX backup_labels_type_project_id_idx ON public.backup_labels USING btree (type, project_id);
CREATE INDEX ci_builds_gitlab_monitor_metrics ON public.ci_builds USING btree (status, created_at, project_id) WHERE ((type)::text = 'Ci::Build'::text);
CREATE INDEX code_owner_approval_required ON public.protected_branches USING btree (project_id, code_owner_approval_required) WHERE (code_owner_approval_required = true);
......@@ -19364,7 +19398,7 @@ CREATE INDEX index_labels_on_group_id_and_title ON public.labels USING btree (gr
CREATE INDEX index_labels_on_project_id ON public.labels USING btree (project_id);
CREATE INDEX index_labels_on_project_id_and_title ON public.labels USING btree (project_id, title) WHERE (group_id = NULL::integer);
CREATE UNIQUE INDEX index_labels_on_project_id_and_title ON public.labels USING btree (project_id, title) WHERE (group_id = NULL::integer);
CREATE INDEX index_labels_on_template ON public.labels USING btree (template) WHERE template;
......@@ -21002,6 +21036,9 @@ ALTER TABLE ONLY public.vulnerabilities
ALTER TABLE ONLY public.labels
ADD CONSTRAINT fk_7de4989a69 FOREIGN KEY (project_id) REFERENCES public.projects(id) ON DELETE CASCADE;
ALTER TABLE ONLY public.backup_labels
ADD CONSTRAINT fk_7de4989a69 FOREIGN KEY (project_id) REFERENCES public.projects(id);
ALTER TABLE ONLY public.merge_requests
ADD CONSTRAINT fk_7e85395a64 FOREIGN KEY (sprint_id) REFERENCES public.sprints(id) ON DELETE CASCADE;
......@@ -22208,6 +22245,9 @@ ALTER TABLE ONLY public.serverless_domain_cluster
ALTER TABLE ONLY public.labels
ADD CONSTRAINT fk_rails_c1ac5161d8 FOREIGN KEY (group_id) REFERENCES public.namespaces(id) ON DELETE CASCADE;
ALTER TABLE ONLY public.backup_labels
ADD CONSTRAINT fk_rails_c1ac5161d8 FOREIGN KEY (group_id) REFERENCES public.namespaces(id);
ALTER TABLE ONLY public.project_feature_usages
ADD CONSTRAINT fk_rails_c22a50024b FOREIGN KEY (project_id) REFERENCES public.projects(id) ON DELETE CASCADE;
......@@ -23195,6 +23235,9 @@ COPY "schema_migrations" (version) FROM STDIN;
20200304160801
20200304160823
20200304211738
20200305020458
20200305082754
20200305082858
20200305121159
20200305151736
20200305200641
......
# frozen_string_literal: true
require 'spec_helper'
require Rails.root.join('db', 'migrate', '20200305082754_remove_duplicate_labels_from_project.rb')
describe RemoveDuplicateLabelsFromProject do
let(:labels_table) { table(:labels) }
let(:labels) { labels_table.all }
let(:projects_table) { table(:projects) }
let(:projects) { projects_table.all }
let(:namespaces_table) { table(:namespaces) }
let(:namespaces) { namespaces_table.all }
let(:backup_labels_table) { table(:backup_labels) }
let(:backup_labels) { backup_labels_table.all }
# all the possible tables with records that may have a relationship with a label
let(:analytics_cycle_analytics_group_stages_table) { table(:analytics_cycle_analytics_group_stages) }
let(:analytics_cycle_analytics_project_stages_table) { table(:analytics_cycle_analytics_project_stages) }
let(:board_labels_table) { table(:board_labels) }
let(:label_links_table) { table(:label_links) }
let(:label_priorities_table) { table(:label_priorities) }
let(:lists_table) { table(:lists) }
let(:resource_label_events_table) { table(:resource_label_events) }
let!(:group_one) { namespaces_table.create!(id: 1, type: 'Group', name: 'group', path: 'group') }
let!(:project_one) do
projects_table.create!(id: 1, name: 'project', path: 'project',
visibility_level: 0, namespace_id: group_one.id)
end
let(:label_title) { 'bug' }
let(:label_color) { 'red' }
let(:label_description) { 'nice label' }
let(:group_id) { group_one.id }
let(:project_id) { project_one.id }
let(:other_title) { 'feature' }
let(:group_label_attributes) do
{
title: label_title, color: label_color, group_id: group_id, type: 'GroupLabel', template: false, description: label_description
}
end
let(:project_label_attributes) do
{
title: label_title, color: label_color, project_id: project_id, type: 'ProjectLabel', template: false, description: label_description
}
end
let(:migration) { described_class.new }
before do
stub_const('CREATE', 1)
stub_const('RENAME', 2)
end
describe 'removing full duplicates' do
context 'when there are no duplicate labels' do
let!(:first_label) { labels_table.create(project_label_attributes.merge(id: 1, title: "a different label")) }
let!(:second_label) { labels_table.create(project_label_attributes.merge(id: 2, title: "a totally different label")) }
it 'does not remove anything' do
expect { migration.up }.not_to change { backup_labels_table.count }
end
it 'restores removed records - no change' do
migration.up
expect { migration.down }.not_to change { labels_table.count }
end
end
context 'with duplicates with no relationships' do
let!(:first_label) { labels_table.create(project_label_attributes.merge(id: 1)) }
let!(:second_label) { labels_table.create(project_label_attributes.merge(id: 2)) }
let!(:third_label) { labels_table.create(project_label_attributes.merge(id: 3, title: other_title)) }
let!(:fourth_label) { labels_table.create(project_label_attributes.merge(id: 4, title: other_title)) }
it 'creates a backup record for each removed record' do
expect { migration.up }.to change { backup_labels_table.count }.from(0).to(2)
end
it 'creates the correct backup records with `create` restore_action' do
migration.up
# can't use the activerecord class because the `type` makes it think it has polymorphism and should be/have a ProjectLabel subclass
backup_labels = ApplicationRecord.connection.execute('SELECT * from backup_labels')
expect(backup_labels.find { |bl| bl["id"] == 2 }).to include(second_label.attributes.merge("restore_action" => CREATE, "new_title" => nil, "created_at" => anything, "updated_at" => anything))
expect(backup_labels.find { |bl| bl["id"] == 4 }).to include(fourth_label.attributes.merge("restore_action" => CREATE, "new_title" => nil, "created_at" => anything, "updated_at" => anything))
end
it 'deletes all but one' do
migration.up
expect { second_label.reload }.to raise_error(ActiveRecord::RecordNotFound)
expect { fourth_label.reload }.to raise_error(ActiveRecord::RecordNotFound)
end
it 'restores removed records' do
second_label_attributes = second_label.attributes
fourth_label_attributes = fourth_label.attributes
migration.up
migration.down
expect(second_label.attributes).to eq(second_label_attributes)
expect(fourth_label.attributes).to eq(fourth_label_attributes)
end
end
context 'two duplicate records, one of which has a relationship' do
let!(:first_label) { labels_table.create(project_label_attributes.merge(id: 1)) }
let!(:second_label) { labels_table.create(project_label_attributes.merge(id: 2)) }
let!(:label_priority) { label_priorities_table.create(label_id: second_label.id, project_id: project_id, priority: 1) }
it 'does not remove anything' do
expect { migration.up }.not_to change { labels_table.count }
end
it 'does not create a backup record with `create` restore_action' do
expect { migration.up }.not_to change { backup_labels_table.where(restore_action: CREATE).count }
end
it 'restores removed records' do
migration.up
expect { migration.down }.not_to change { labels_table.count }
end
end
context 'multiple duplicates, a subset of which have relationships' do
let!(:first_label) { labels_table.create(project_label_attributes.merge(id: 1)) }
let!(:second_label) { labels_table.create(project_label_attributes.merge(id: 2)) }
let!(:label_priority_for_second_label) { label_priorities_table.create(label_id: second_label.id, project_id: project_id, priority: 1) }
let!(:third_label) { labels_table.create(project_label_attributes.merge(id: 3)) }
let!(:fourth_label) { labels_table.create(project_label_attributes.merge(id: 4)) }
let!(:label_priority_for_fourth_label) { label_priorities_table.create(label_id: fourth_label.id, project_id: project_id, priority: 2) }
it 'creates a backup record with `create` restore_action for each removed record' do
expect { migration.up }.to change { backup_labels_table.where(restore_action: CREATE).count }.from(0).to(1)
end
it 'creates the correct backup records' do
migration.up
# can't use the activerecord class because the `type` column makes it think it has polymorphism and should be/have a ProjectLabel subclass
backup_labels = ApplicationRecord.connection.execute('SELECT * from backup_labels')
expect(backup_labels.find { |bl| bl["id"] == 3 }).to include(third_label.attributes.merge("restore_action" => CREATE, "new_title" => nil, "created_at" => anything, "updated_at" => anything))
end
it 'deletes the third record' do
migration.up
expect { first_label.reload }.not_to raise_error
expect { second_label.reload }.not_to raise_error
expect { third_label.reload }.to raise_error(ActiveRecord::RecordNotFound)
end
it 'restores removed records' do
third_label_attributes = third_label.attributes
migration.up
migration.down
expect(third_label.attributes).to eq(third_label_attributes)
end
end
end
describe 'renaming partial duplicates' do
# partial duplicates - only project_id and title match. Distinct colour prevents deletion.
context 'when there are no duplicate labels' do
let!(:first_label) { labels_table.create(project_label_attributes.merge(id: 1, title: "a unique label", color: 'green')) }
let!(:second_label) { labels_table.create(project_label_attributes.merge(id: 2, title: "a totally different, unique, label", color: 'blue')) }
it 'does not rename anything' do
expect { migration.up }.not_to change { backup_labels_table.count }
end
end
context 'with duplicates with no relationships' do
let!(:first_label) { labels_table.create(project_label_attributes.merge(id: 1, color: 'green')) }
let!(:second_label) { labels_table.create(project_label_attributes.merge(id: 2, color: 'blue')) }
let!(:third_label) { labels_table.create(project_label_attributes.merge(id: 3, title: other_title, color: 'purple')) }
let!(:fourth_label) { labels_table.create(project_label_attributes.merge(id: 4, title: other_title, color: 'yellow')) }
it 'creates a backup record for each renamed record' do
expect { migration.up }.to change { backup_labels_table.count }.from(0).to(2)
end
it 'creates the correct backup records with `rename` restore_action' do
migration.up
# can't use the activerecord class because the `type` makes it think it has polymorphism and should be/have a ProjectLabel subclass
backup_labels = ApplicationRecord.connection.execute('SELECT * from backup_labels')
expect(backup_labels.find { |bl| bl["id"] == 2 }).to include(second_label.attributes.merge("restore_action" => RENAME, "created_at" => anything, "updated_at" => anything))
expect(backup_labels.find { |bl| bl["id"] == 4 }).to include(fourth_label.attributes.merge("restore_action" => RENAME, "created_at" => anything, "updated_at" => anything))
end
it 'modifies the title of the second label' do
expect { migration.up }.to change { second_label.reload.title }.from(label_title).to(a_string_matching(/#{label_title}_duplicate/))
end
it 'modifies the title of the fourth label' do
expect { migration.up }.to change { fourth_label.reload.title }.from(other_title).to(a_string_matching(/#{other_title}_duplicate/))
end
it 'renames all but one' do
migration.up
expect(second_label.reload.title).to match(/#{label_title}_duplicate/)
expect(fourth_label.reload.title).to match(/#{other_title}_duplicate/)
end
it 'restores renamed records' do
second_label_attributes = second_label.attributes
fourth_label_attributes = fourth_label.attributes
migration.up
migration.down
expect(second_label.attributes).to eq(second_label_attributes)
expect(fourth_label.attributes).to eq(fourth_label_attributes)
end
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment