Commit 73afd158 authored by nmilojevic1's avatar nmilojevic1 Committed by Qingyu Zhao

Add retry logic for failures during import

- Extract ImportFailure retry and log mechanism
- Try 3 times for exceptions:
     GRPC::DeadlineExceeded
     ActiveRecord::QueryCanceled
- Store each retry failure and retry_count to ImportFailure
- Retriable config can be set via ENV variables:
     RELATION_IMPORT_TRIES
     RELATION_IMPORT_BASE_INTERVAL
     RELATION_IMPORT_MULTIPLIER
     RELATION_IMPORT_RAND_FACTOR
- Migration scripts to:
     Add retry_count and group_id to ImportFailure table
     Add conditional index for project and group to ImportFailure table
parent 452910ec
...@@ -57,6 +57,8 @@ class Group < Namespace ...@@ -57,6 +57,8 @@ class Group < Namespace
has_one :import_export_upload has_one :import_export_upload
has_many :import_failures, inverse_of: :group
accepts_nested_attributes_for :variables, allow_destroy: true accepts_nested_attributes_for :variables, allow_destroy: true
validate :visibility_level_allowed_by_projects validate :visibility_level_allowed_by_projects
......
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
class ImportFailure < ApplicationRecord class ImportFailure < ApplicationRecord
belongs_to :project belongs_to :project
belongs_to :group
validates :project, presence: true validates :project, presence: true, unless: :group
validates :group, presence: true, unless: :project
end end
---
title: Add retry logic for failures during import
merge_request: 22265
author:
type: added
# frozen_string_literal: true
Retriable.configure do |config|
config.contexts[:relation_import] = {
tries: ENV.fetch('RELATION_IMPORT_TRIES', 3).to_i,
base_interval: ENV.fetch('RELATION_IMPORT_BASE_INTERVAL', 0.5).to_f,
multiplier: ENV.fetch('RELATION_IMPORT_MULTIPLIER', 1.5).to_f,
rand_factor: ENV.fetch('RELATION_IMPORT_RAND_FACTOR', 0.5).to_f,
on: Gitlab::ImportExport::ImportFailureService::RETRIABLE_EXCEPTIONS
}
end
# frozen_string_literal: true
class AddRetryCountAndGroupIdToImportFailures < ActiveRecord::Migration[5.2]
DOWNTIME = false
def change
add_column :import_failures, :retry_count, :integer
add_column :import_failures, :group_id, :integer
change_column_null(:import_failures, :project_id, true)
end
end
# frozen_string_literal: true
class AddGroupIndexAndFkToImportFailures < ActiveRecord::Migration[5.2]
include Gitlab::Database::MigrationHelpers
DOWNTIME = false
GROUP_INDEX = 'index_import_failures_on_group_id_not_null'.freeze
disable_ddl_transaction!
def up
add_concurrent_index(:import_failures, :group_id, where: 'group_id IS NOT NULL', name: GROUP_INDEX)
add_concurrent_foreign_key(:import_failures, :namespaces, column: :group_id)
end
def down
remove_foreign_key(:import_failures, column: :group_id)
remove_concurrent_index_by_name(:import_failures, GROUP_INDEX)
end
end
# frozen_string_literal: true
class UpdateProjectIndexToImportFailures < ActiveRecord::Migration[5.2]
include Gitlab::Database::MigrationHelpers
# Set this constant to true if this migration requires downtime.
DOWNTIME = false
PROJECT_INDEX_OLD = 'index_import_failures_on_project_id'.freeze
PROJECT_INDEX_NEW = 'index_import_failures_on_project_id_not_null'.freeze
disable_ddl_transaction!
def up
add_concurrent_index(:import_failures, :project_id, where: 'project_id IS NOT NULL', name: PROJECT_INDEX_NEW)
remove_concurrent_index_by_name(:import_failures, PROJECT_INDEX_OLD)
end
def down
add_concurrent_index(:import_failures, :project_id, name: PROJECT_INDEX_OLD)
remove_concurrent_index_by_name(:import_failures, PROJECT_INDEX_NEW)
end
end
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 2020_01_14_204949) do ActiveRecord::Schema.define(version: 2020_01_17_112554) do
# These are extensions that must be enabled in order to support this database # These are extensions that must be enabled in order to support this database
enable_extension "pg_trgm" enable_extension "pg_trgm"
...@@ -2036,14 +2036,17 @@ ActiveRecord::Schema.define(version: 2020_01_14_204949) do ...@@ -2036,14 +2036,17 @@ ActiveRecord::Schema.define(version: 2020_01_14_204949) do
create_table "import_failures", force: :cascade do |t| create_table "import_failures", force: :cascade do |t|
t.integer "relation_index" t.integer "relation_index"
t.bigint "project_id", null: false t.bigint "project_id"
t.datetime_with_timezone "created_at", null: false t.datetime_with_timezone "created_at", null: false
t.string "relation_key", limit: 64 t.string "relation_key", limit: 64
t.string "exception_class", limit: 128 t.string "exception_class", limit: 128
t.string "correlation_id_value", limit: 128 t.string "correlation_id_value", limit: 128
t.string "exception_message", limit: 255 t.string "exception_message", limit: 255
t.integer "retry_count"
t.integer "group_id"
t.index ["correlation_id_value"], name: "index_import_failures_on_correlation_id_value" t.index ["correlation_id_value"], name: "index_import_failures_on_correlation_id_value"
t.index ["project_id"], name: "index_import_failures_on_project_id" t.index ["group_id"], name: "index_import_failures_on_group_id_not_null", where: "(group_id IS NOT NULL)"
t.index ["project_id"], name: "index_import_failures_on_project_id_not_null", where: "(project_id IS NOT NULL)"
end end
create_table "index_statuses", id: :serial, force: :cascade do |t| create_table "index_statuses", id: :serial, force: :cascade do |t|
...@@ -4643,6 +4646,7 @@ ActiveRecord::Schema.define(version: 2020_01_14_204949) do ...@@ -4643,6 +4646,7 @@ ActiveRecord::Schema.define(version: 2020_01_14_204949) do
add_foreign_key "identities", "saml_providers", name: "fk_aade90f0fc", on_delete: :cascade add_foreign_key "identities", "saml_providers", name: "fk_aade90f0fc", on_delete: :cascade
add_foreign_key "import_export_uploads", "namespaces", column: "group_id", name: "fk_83319d9721", on_delete: :cascade add_foreign_key "import_export_uploads", "namespaces", column: "group_id", name: "fk_83319d9721", on_delete: :cascade
add_foreign_key "import_export_uploads", "projects", on_delete: :cascade add_foreign_key "import_export_uploads", "projects", on_delete: :cascade
add_foreign_key "import_failures", "namespaces", column: "group_id", name: "fk_24b824da43", on_delete: :cascade
add_foreign_key "index_statuses", "projects", name: "fk_74b2492545", on_delete: :cascade add_foreign_key "index_statuses", "projects", name: "fk_74b2492545", on_delete: :cascade
add_foreign_key "insights", "namespaces", on_delete: :cascade add_foreign_key "insights", "namespaces", on_delete: :cascade
add_foreign_key "insights", "projects", on_delete: :cascade add_foreign_key "insights", "projects", on_delete: :cascade
......
# frozen_string_literal: true
module Gitlab
module ImportExport
class ImportFailureService
RETRIABLE_EXCEPTIONS = [GRPC::DeadlineExceeded, ActiveRecord::QueryCanceled].freeze
attr_reader :importable
def initialize(importable)
@importable = importable
@association = importable.association(:import_failures)
end
def with_retry(relation_key, relation_index)
on_retry = -> (exception, retry_count, *_args) do
log_import_failure(relation_key, relation_index, exception, retry_count)
end
Retriable.with_context(:relation_import, on_retry: on_retry) do
yield
end
end
def log_import_failure(relation_key, relation_index, exception, retry_count = 0)
extra = {
relation_key: relation_key,
relation_index: relation_index,
retry_count: retry_count
}
extra[importable_column_name] = importable.id
Gitlab::ErrorTracking.track_exception(exception, extra)
attributes = {
exception_class: exception.class.to_s,
exception_message: exception.message.truncate(255),
correlation_id_value: Labkit::Correlation::CorrelationId.current_or_new_id
}.merge(extra)
ImportFailure.create(attributes)
end
private
def importable_column_name
@importable_column_name ||= @association.reflection.foreign_key.to_sym
end
end
end
end
...@@ -71,25 +71,18 @@ module Gitlab ...@@ -71,25 +71,18 @@ module Gitlab
return if importable_class == Project && group_model?(relation_object) return if importable_class == Project && group_model?(relation_object)
relation_object.assign_attributes(importable_class_sym => @importable) relation_object.assign_attributes(importable_class_sym => @importable)
relation_object.save!
import_failure_service.with_retry(relation_key, relation_index) do
relation_object.save!
end
save_id_mapping(relation_key, data_hash, relation_object) save_id_mapping(relation_key, data_hash, relation_object)
rescue => e rescue => e
log_import_failure(relation_key, relation_index, e) import_failure_service.log_import_failure(relation_key, relation_index, e)
end end
def log_import_failure(relation_key, relation_index, exception) def import_failure_service
Gitlab::ErrorTracking.track_exception(exception, @import_failure_service ||= ImportFailureService.new(@importable)
project_id: @importable.id, relation_key: relation_key, relation_index: relation_index)
ImportFailure.create(
project: @importable,
relation_key: relation_key,
relation_index: relation_index,
exception_class: exception.class.to_s,
exception_message: exception.message.truncate(255),
correlation_id_value: Labkit::Correlation::CorrelationId.current_or_new_id
)
end end
# Older, serialized CI pipeline exports may only have a # Older, serialized CI pipeline exports may only have a
......
# frozen_string_literal: true
require 'spec_helper'
describe Gitlab::ImportExport::ImportFailureService do
let(:importable) { create(:project, :builds_enabled, :issues_disabled, name: 'project', path: 'project') }
let(:label) { create(:label) }
let(:subject) { described_class.new(importable) }
let(:relation_key) { "labels" }
let(:relation_index) { 0 }
describe '#log_import_failure' do
let(:standard_error_message) { "StandardError message" }
let(:exception) { StandardError.new(standard_error_message) }
let(:correlation_id) { 'my-correlation-id' }
let(:retry_count) { 2 }
let(:log_import_failure) do
subject.log_import_failure(relation_key, relation_index, exception, retry_count)
end
before do
# Import is running from the rake task, `correlation_id` is not assigned
allow(Labkit::Correlation::CorrelationId).to receive(:current_or_new_id).and_return(correlation_id)
end
context 'when importable is a group' do
let(:importable) { create(:group) }
it_behaves_like 'log import failure', :group_id
end
context 'when importable is a project' do
it_behaves_like 'log import failure', :project_id
end
context 'when ImportFailure does not support importable class' do
let(:importable) { create(:merge_request) }
it 'raise exception' do
expect { subject }.to raise_exception(ActiveRecord::AssociationNotFoundError, "Association named 'import_failures' was not found on MergeRequest; perhaps you misspelled it?")
end
end
end
describe '#with_retry' do
let(:perform_retry) do
subject.with_retry(relation_key, relation_index) do
label.save!
end
end
context 'when exceptions are retriable' do
where(:exception) { Gitlab::ImportExport::ImportFailureService::RETRIABLE_EXCEPTIONS }
with_them do
context 'when retry succeeds' do
before do
response_values = [:raise, true]
allow(label).to receive(:save!).exactly(2).times do
value = response_values.shift
value == :raise ? raise(exception) : value
end
end
it 'retries 1 time' do
expect(label).to receive(:save!).and_raise(exception.new)
expect(label).to receive(:save!).and_return(true)
perform_retry
end
it 'retries and logs import failure once with correct params' do
expect(subject).to receive(:log_import_failure).with(relation_key, relation_index, instance_of(exception), 1)
perform_retry
end
end
context 'when retry continues to fail with intermittent errors' do
let(:maximum_retry_count) do
Retriable.config.tries
end
before do
expect(label).to receive(:save!)
.exactly(maximum_retry_count).times
.and_raise(exception.new)
end
it 'retries the number of times allowed and raise exception', :aggregate_failures do
expect { perform_retry }.to raise_exception(exception)
end
it 'logs import failure each time and raise exception', :aggregate_failures do
maximum_retry_count.times do |index|
retry_count = index + 1
expect(subject).to receive(:log_import_failure).with(relation_key, relation_index, instance_of(exception), retry_count)
end
expect { perform_retry }.to raise_exception(exception)
end
end
end
end
context 'when exception is not retriable' do
let(:exception) { StandardError.new }
before do
allow(label).to receive(:save!).and_raise(exception)
end
it 'raise the exception' do
aggregate_failures do
expect(label).to receive(:save!).once
expect(subject).not_to receive(:log_import_failure)
expect { perform_retry }.to raise_exception(exception)
end
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
describe ImportFailure do
describe "Associations" do
it { is_expected.to belong_to(:project) }
it { is_expected.to belong_to(:group) }
end
describe 'Validations' do
context 'has no group' do
before do
allow(subject).to receive(:group).and_return(nil)
end
it { is_expected.to validate_presence_of(:project) }
end
context 'has no project' do
before do
allow(subject).to receive(:project).and_return(nil)
end
it { is_expected.to validate_presence_of(:project) }
end
end
end
# frozen_string_literal: true
shared_examples 'log import failure' do |importable_column|
it 'tracks error' do
extra = {
relation_key: relation_key,
relation_index: relation_index,
retry_count: retry_count
}
extra[importable_column] = importable.id
expect(Gitlab::ErrorTracking).to receive(:track_exception).with(exception, extra)
subject.log_import_failure(relation_key, relation_index, exception, retry_count)
end
it 'saves data to ImportFailure' do
log_import_failure
import_failure = ImportFailure.last
aggregate_failures do
expect(import_failure[importable_column]).to eq(importable.id)
expect(import_failure.relation_key).to eq(relation_key)
expect(import_failure.relation_index).to eq(relation_index)
expect(import_failure.exception_class).to eq('StandardError')
expect(import_failure.exception_message).to eq(standard_error_message)
expect(import_failure.correlation_id_value).to eq(correlation_id)
expect(import_failure.retry_count).to eq(retry_count)
end
end
end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment