Commit cf3fd0f0 authored by syasonik's avatar syasonik

Consistently handle recovery alerts

Allow alerts to close upon receiving a
recovery alert such that the auto-close
incident setting is only controlling incidents
rather than alerts. All recovery alerts should
resolve the corresponding GitLab alert.

Open and resolve self-recovery alerts for both
prometheus and generic alerts.
parent ab28e0be
...@@ -25,13 +25,6 @@ module AlertManagement ...@@ -25,13 +25,6 @@ module AlertManagement
attr_reader :project, :payload attr_reader :project, :payload
override :process_new_alert
def process_new_alert
return if resolving_alert?
super
end
override :incoming_payload override :incoming_payload
def incoming_payload def incoming_payload
strong_memoize(:incoming_payload) do strong_memoize(:incoming_payload) do
......
...@@ -19,11 +19,7 @@ module AlertManagement ...@@ -19,11 +19,7 @@ module AlertManagement
# Updates or creates alert from payload for project # Updates or creates alert from payload for project
# including system notes # including system notes
def process_alert def process_alert
if alert.persisted? alert.persisted? ? process_existing_alert : process_new_alert
process_existing_alert
else
process_new_alert
end
end end
# Creates or closes issue for alert and notifies stakeholders # Creates or closes issue for alert and notifies stakeholders
...@@ -33,22 +29,16 @@ module AlertManagement ...@@ -33,22 +29,16 @@ module AlertManagement
end end
def process_existing_alert def process_existing_alert
if resolving_alert? resolving_alert? ? process_resolved_alert : process_firing_alert
process_resolved_alert
else
process_firing_alert
end
end end
def process_resolved_alert def process_resolved_alert
SystemNoteService.log_resolving_alert(alert, alert_source) SystemNoteService.log_resolving_alert(alert, alert_source)
return unless auto_close_incident?
if alert.resolve(incoming_payload.ends_at) if alert.resolve(incoming_payload.ends_at)
SystemNoteService.change_alert_status(alert, User.alert_bot) SystemNoteService.change_alert_status(alert, User.alert_bot)
close_issue(alert.issue) close_issue(alert.issue) if auto_close_incident?
else else
logger.warn( logger.warn(
message: 'Unable to update AlertManagement::Alert status to resolved', message: 'Unable to update AlertManagement::Alert status to resolved',
...@@ -76,6 +66,8 @@ module AlertManagement ...@@ -76,6 +66,8 @@ module AlertManagement
if alert.save if alert.save
alert.execute_services alert.execute_services
SystemNoteService.create_new_alert(alert, alert_source) SystemNoteService.create_new_alert(alert, alert_source)
process_resolved_alert if resolving_alert?
else else
logger.warn( logger.warn(
message: "Unable to create AlertManagement::Alert from #{alert_source}", message: "Unable to create AlertManagement::Alert from #{alert_source}",
...@@ -128,7 +120,7 @@ module AlertManagement ...@@ -128,7 +120,7 @@ module AlertManagement
end end
def alert_source def alert_source
alert.monitoring_tool incoming_payload.monitoring_tool
end end
def logger def logger
......
---
title: Always resolve GitLab alerts when recovery alert payload is received
merge_request: 57302
author:
type: changed
...@@ -42,7 +42,7 @@ RSpec.describe AlertManagement::ProcessPrometheusAlertService do ...@@ -42,7 +42,7 @@ RSpec.describe AlertManagement::ProcessPrometheusAlertService do
context 'with resolving payload' do context 'with resolving payload' do
let(:payload) { raw_payload.merge('status' => 'resolved') } let(:payload) { raw_payload.merge('status' => 'resolved') }
include_examples 'oncall users are correctly notified of Prometheus recovery alert' include_examples 'oncall users are correctly notified of recovery alert'
end end
end end
end end
......
...@@ -33,33 +33,3 @@ RSpec.shared_examples 'oncall users are correctly notified of recovery alert' do ...@@ -33,33 +33,3 @@ RSpec.shared_examples 'oncall users are correctly notified of recovery alert' do
end end
end end
end end
RSpec.shared_examples 'oncall users are correctly notified of Prometheus recovery alert' do
it_behaves_like 'does not send on-call notification'
context 'when alert with the same fingerprint already exists' do
context 'which is triggered' do
let_it_be(:alert) { create(:alert_management_alert, :triggered, fingerprint: gitlab_fingerprint, project: project) }
it_behaves_like 'sends on-call notification if enabled'
end
context 'which is acknowledged' do
let_it_be(:alert) { create(:alert_management_alert, :acknowledged, fingerprint: gitlab_fingerprint, project: project) }
it_behaves_like 'sends on-call notification if enabled'
end
context 'which is resolved' do
let_it_be(:alert) { create(:alert_management_alert, :resolved, fingerprint: gitlab_fingerprint, project: project) }
it_behaves_like 'does not send on-call notification'
end
context 'which is ignored' do
let_it_be(:alert) { create(:alert_management_alert, :ignored, fingerprint: gitlab_fingerprint, project: project) }
it_behaves_like 'sends on-call notification if enabled'
end
end
end
...@@ -48,7 +48,7 @@ RSpec.describe AlertManagement::ProcessPrometheusAlertService do ...@@ -48,7 +48,7 @@ RSpec.describe AlertManagement::ProcessPrometheusAlertService do
context 'with resolving payload' do context 'with resolving payload' do
let(:prometheus_status) { 'resolved' } let(:prometheus_status) { 'resolved' }
it_behaves_like 'processes prometheus recovery alert' it_behaves_like 'processes recovery alert'
end end
context 'environment given' do context 'environment given' do
......
...@@ -111,67 +111,3 @@ RSpec.shared_examples 'processes recovery alert' do ...@@ -111,67 +111,3 @@ RSpec.shared_examples 'processes recovery alert' do
end end
end end
end end
RSpec.shared_examples 'processes prometheus recovery alert' do
context 'seen for the first time' do
it_behaves_like 'does not create an alert management alert'
it_behaves_like 'does not send alert notification emails'
it_behaves_like 'does not process incident issues'
end
context 'for an existing alert with the same fingerprint' do
let_it_be(:gitlab_fingerprint) { Digest::SHA1.hexdigest(fingerprint) }
context 'which is triggered' do
let_it_be(:alert) { create(:alert_management_alert, :triggered, project: project, fingerprint: gitlab_fingerprint, monitoring_tool: source) }
it_behaves_like 'resolves an existing alert management alert'
it_behaves_like 'creates expected system notes for alert', :recovery_alert, :resolve_alert
it_behaves_like 'sends alert notification emails if enabled'
it_behaves_like 'closes related incident if enabled'
it_behaves_like 'writes a warning to the log for a failed alert status update'
it_behaves_like 'does not create an alert management alert'
it_behaves_like 'does not process incident issues'
it_behaves_like 'does not add an alert management alert event'
end
context 'which is ignored' do
let_it_be(:alert) { create(:alert_management_alert, :ignored, project: project, fingerprint: gitlab_fingerprint, monitoring_tool: source) }
it_behaves_like 'resolves an existing alert management alert'
it_behaves_like 'creates expected system notes for alert', :recovery_alert, :resolve_alert
it_behaves_like 'sends alert notification emails if enabled'
it_behaves_like 'closes related incident if enabled'
it_behaves_like 'writes a warning to the log for a failed alert status update'
it_behaves_like 'does not create an alert management alert'
it_behaves_like 'does not process incident issues'
it_behaves_like 'does not add an alert management alert event'
end
context 'which is acknowledged' do
let_it_be(:alert) { create(:alert_management_alert, :acknowledged, project: project, fingerprint: gitlab_fingerprint, monitoring_tool: source) }
it_behaves_like 'resolves an existing alert management alert'
it_behaves_like 'creates expected system notes for alert', :recovery_alert, :resolve_alert
it_behaves_like 'sends alert notification emails if enabled'
it_behaves_like 'closes related incident if enabled'
it_behaves_like 'writes a warning to the log for a failed alert status update'
it_behaves_like 'does not create an alert management alert'
it_behaves_like 'does not process incident issues'
it_behaves_like 'does not add an alert management alert event'
end
context 'which is resolved' do
let_it_be(:alert) { create(:alert_management_alert, :resolved, project: project, fingerprint: gitlab_fingerprint, monitoring_tool: source) }
it_behaves_like 'does not create an alert management alert'
it_behaves_like 'does not send alert notification emails'
it_behaves_like 'does not change the alert end time'
it_behaves_like 'does not process incident issues'
it_behaves_like 'does not add an alert management alert event'
end
end
end
...@@ -48,9 +48,19 @@ end ...@@ -48,9 +48,19 @@ end
RSpec.shared_examples 'processes never-before-seen recovery alert' do RSpec.shared_examples 'processes never-before-seen recovery alert' do
it_behaves_like 'creates an alert management alert or errors' it_behaves_like 'creates an alert management alert or errors'
it_behaves_like 'creates expected system notes for alert', :new_alert it_behaves_like 'creates expected system notes for alert', :new_alert, :recovery_alert, :resolve_alert
it_behaves_like 'sends alert notification emails if enabled' it_behaves_like 'sends alert notification emails if enabled'
it_behaves_like 'processes incident issues if enabled' it_behaves_like 'does not process incident issues'
it_behaves_like 'writes a warning to the log for a failed alert status update' do
let(:alert) { nil } # Ensure the next alert id is used
end
it 'resolves the alert' do
subject
expect(AlertManagement::Alert.last.ended_at).to be_present
expect(AlertManagement::Alert.last.resolved?).to be(true)
end
end end
RSpec.shared_examples 'processes one firing and one resolved prometheus alerts' do RSpec.shared_examples 'processes one firing and one resolved prometheus alerts' do
...@@ -58,10 +68,10 @@ RSpec.shared_examples 'processes one firing and one resolved prometheus alerts' ...@@ -58,10 +68,10 @@ RSpec.shared_examples 'processes one firing and one resolved prometheus alerts'
expect(Gitlab::AppLogger).not_to receive(:warn) expect(Gitlab::AppLogger).not_to receive(:warn)
expect { subject } expect { subject }
.to change(AlertManagement::Alert, :count).by(1) .to change(AlertManagement::Alert, :count).by(2)
.and change(Note, :count).by(1) .and change(Note, :count).by(4)
end end
it_behaves_like 'processes incident issues' it_behaves_like 'processes incident issues'
it_behaves_like 'sends alert notification emails', count: 1 it_behaves_like 'sends alert notification emails', count: 2
end end
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment