Commit acec32bd authored by Peter Leitzen, committed by Kamil Trzciński

Resolve "Persist Prometheus alert events"

parent c77896f3
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
# #
# It's strongly recommended that you check this file into your version control system. # It's strongly recommended that you check this file into your version control system.
ActiveRecord::Schema.define(version: 20180924141949) do ActiveRecord::Schema.define(version: 20180926140319) do
# These are extensions that must be enabled in order to support this database # These are extensions that must be enabled in order to support this database
enable_extension "plpgsql" enable_extension "plpgsql"
...@@ -2273,6 +2273,18 @@ ActiveRecord::Schema.define(version: 20180924141949) do ...@@ -2273,6 +2273,18 @@ ActiveRecord::Schema.define(version: 20180924141949) do
add_index "projects", ["star_count"], name: "index_projects_on_star_count", using: :btree add_index "projects", ["star_count"], name: "index_projects_on_star_count", using: :btree
add_index "projects", ["visibility_level"], name: "index_projects_on_visibility_level", using: :btree add_index "projects", ["visibility_level"], name: "index_projects_on_visibility_level", using: :btree
create_table "prometheus_alert_events", id: :bigserial, force: :cascade do |t|
t.integer "project_id", null: false
t.integer "prometheus_alert_id", null: false
t.datetime_with_timezone "started_at", null: false
t.datetime_with_timezone "ended_at"
t.integer "status", limit: 2
t.string "payload_key"
end
add_index "prometheus_alert_events", ["project_id", "status"], name: "index_prometheus_alert_events_on_project_id_and_status", using: :btree
add_index "prometheus_alert_events", ["prometheus_alert_id", "payload_key"], name: "index_prometheus_alert_event_scoped_payload_key", unique: true, using: :btree
create_table "prometheus_alerts", force: :cascade do |t| create_table "prometheus_alerts", force: :cascade do |t|
t.datetime_with_timezone "created_at", null: false t.datetime_with_timezone "created_at", null: false
t.datetime_with_timezone "updated_at", null: false t.datetime_with_timezone "updated_at", null: false
...@@ -3241,6 +3253,8 @@ ActiveRecord::Schema.define(version: 20180924141949) do ...@@ -3241,6 +3253,8 @@ ActiveRecord::Schema.define(version: 20180924141949) do
add_foreign_key "project_mirror_data", "projects", name: "fk_d1aad367d7", on_delete: :cascade add_foreign_key "project_mirror_data", "projects", name: "fk_d1aad367d7", on_delete: :cascade
add_foreign_key "project_repository_states", "projects", on_delete: :cascade add_foreign_key "project_repository_states", "projects", on_delete: :cascade
add_foreign_key "project_statistics", "projects", on_delete: :cascade add_foreign_key "project_statistics", "projects", on_delete: :cascade
add_foreign_key "prometheus_alert_events", "projects", on_delete: :cascade
add_foreign_key "prometheus_alert_events", "prometheus_alerts", on_delete: :cascade
add_foreign_key "prometheus_alerts", "environments", on_delete: :cascade add_foreign_key "prometheus_alerts", "environments", on_delete: :cascade
add_foreign_key "prometheus_alerts", "projects", on_delete: :cascade add_foreign_key "prometheus_alerts", "projects", on_delete: :cascade
add_foreign_key "prometheus_alerts", "prometheus_metrics", on_delete: :cascade add_foreign_key "prometheus_alerts", "prometheus_metrics", on_delete: :cascade
......
...@@ -89,11 +89,9 @@ module Projects ...@@ -89,11 +89,9 @@ module Projects
PrometheusAlertSerializer.new(project: project, current_user: current_user) PrometheusAlertSerializer.new(project: project, current_user: current_user)
end end
# rubocop: disable CodeReuse/ActiveRecord
def alert def alert
@alert ||= project.prometheus_alerts.find_by(prometheus_metric_id: params[:id]) || render_404 @alert ||= project.prometheus_alerts.for_metric(params[:id]).first || render_404
end end
# rubocop: enable CodeReuse/ActiveRecord
def application def application
@application ||= alert.environment.cluster_prometheus_adapter @application ||= alert.environment.cluster_prometheus_adapter
......
...@@ -48,6 +48,7 @@ module EE ...@@ -48,6 +48,7 @@ module EE
has_many :source_pipelines, class_name: 'Ci::Sources::Pipeline', foreign_key: :project_id has_many :source_pipelines, class_name: 'Ci::Sources::Pipeline', foreign_key: :project_id
has_many :prometheus_alerts, inverse_of: :project has_many :prometheus_alerts, inverse_of: :project
has_many :prometheus_alert_events, inverse_of: :project
scope :with_shared_runners_limit_enabled, -> { with_shared_runners.non_public_only } scope :with_shared_runners_limit_enabled, -> { with_shared_runners.non_public_only }
......
...@@ -9,6 +9,8 @@ class PrometheusAlert < ActiveRecord::Base ...@@ -9,6 +9,8 @@ class PrometheusAlert < ActiveRecord::Base
belongs_to :project, required: true, validate: true, inverse_of: :prometheus_alerts belongs_to :project, required: true, validate: true, inverse_of: :prometheus_alerts
belongs_to :prometheus_metric, required: true, validate: true, inverse_of: :prometheus_alerts belongs_to :prometheus_metric, required: true, validate: true, inverse_of: :prometheus_alerts
has_many :prometheus_alert_events, inverse_of: :prometheus_alert
after_save :clear_prometheus_adapter_cache! after_save :clear_prometheus_adapter_cache!
after_destroy :clear_prometheus_adapter_cache! after_destroy :clear_prometheus_adapter_cache!
...@@ -19,6 +21,8 @@ class PrometheusAlert < ActiveRecord::Base ...@@ -19,6 +21,8 @@ class PrometheusAlert < ActiveRecord::Base
delegate :title, :query, to: :prometheus_metric delegate :title, :query, to: :prometheus_metric
scope :for_metric, -> (metric) { where(prometheus_metric: metric) }
def self.operator_to_enum(op) def self.operator_to_enum(op)
OPERATORS_MAP.invert.fetch(op) OPERATORS_MAP.invert.fetch(op)
end end
......
# frozen_string_literal: true
# A single occurrence of a PrometheusAlert for a project.
#
# The `status` column is driven by a state machine:
#
#   none (nil) --fire--> firing (0) --resolve--> resolved (1)
#
# While firing, an event is identified by its `payload_key` (unique per
# alert); the key is cleared again when the event is resolved.
class PrometheusAlertEvent < ActiveRecord::Base
  belongs_to :project, required: true, validate: true, inverse_of: :prometheus_alert_events
  belongs_to :prometheus_alert, required: true, validate: true, inverse_of: :prometheus_alert_events

  validates :payload_key, uniqueness: { scope: :prometheus_alert_id }
  validates :started_at, :status, presence: true

  delegate :title, :prometheus_metric_id, to: :prometheus_alert

  state_machine :status, initial: :none do
    state :none, value: nil

    # A firing event must carry its payload key and must not have ended yet.
    state :firing, value: 0 do
      validates :payload_key, presence: true
      validates :ended_at, absence: true
    end

    # A resolved event has an end date and no payload key anymore.
    state :resolved, value: 1 do
      validates :payload_key, absence: true
      validates :ended_at, presence: true
    end

    event :fire do
      transition none: :firing
    end

    event :resolve do
      transition firing: :resolved
    end

    # The first argument given to `fire` becomes the start date.
    before_transition to: :firing do |event, transition|
      event.started_at = transition.args.first
    end

    # The first argument given to `resolve` becomes the end date.
    before_transition to: :resolved do |event, transition|
      event.payload_key = nil
      event.ended_at = transition.args.first
    end
  end

  scope :firing, -> { where(status: status_value_for(:firing)) }
  scope :resolved, -> { where(status: status_value_for(:resolved)) }

  class << self
    # Finds the event matching project, alert and payload key, or builds an
    # unsaved one with these attributes.
    def find_or_initialize_by_payload_key(project, alert, payload_key)
      find_or_initialize_by(
        project: project,
        prometheus_alert: alert,
        payload_key: payload_key
      )
    end

    # Returns the value stored in `status` for the state called `name`.
    def status_value_for(name)
      machine = state_machines[:status]
      machine.states[name].value
    end

    # Returns the SHA1 hex digest of "<gitlab_alert_id>/<started_at>".
    def payload_key_for(gitlab_alert_id, started_at)
      Digest::SHA1.hexdigest([gitlab_alert_id, started_at].join('/'))
    end
  end
end
# frozen_string_literal: true
module Projects
  module Prometheus
    module Alerts
      # Persists a series of Prometheus alert events as a list of
      # PrometheusAlertEvent records.
      #
      # Alerts are read from `params['alerts']`. An entry is skipped (instead
      # of raising) when it is malformed - wrong type, missing or unparsable
      # field - when no alert of the project matches its `gitlab_alert_id`
      # label, or when the requested state transition is rejected.
      class CreateEventsService < BaseService
        # Returns the list of events whose `fire`/`resolve` call succeeded.
        def execute
          create_events_from(alerts)
        end

        private

        def create_events_from(alerts)
          Array.wrap(alerts).map { |alert| create_event(alert) }.compact
        end

        # Looks up (or initializes) the event described by a single alert
        # payload and applies the transition named by its `status`.
        #
        # Returns the event, or nil when the payload has been skipped.
        def create_event(payload)
          return unless payload.respond_to?(:dig)

          status = dig_payload(payload, 'status')
          return unless status

          started_at = validate_date(dig_payload(payload, 'startsAt'))
          return unless started_at

          ended_at = validate_date(dig_payload(payload, 'endsAt'))
          return unless ended_at

          gitlab_alert_id = dig_payload(payload, 'labels', 'gitlab_alert_id')
          return unless gitlab_alert_id

          alert = project.prometheus_alerts.for_metric(gitlab_alert_id).first
          return unless alert

          payload_key = PrometheusAlertEvent.payload_key_for(gitlab_alert_id, started_at)
          event = PrometheusAlertEvent.find_or_initialize_by_payload_key(project, alert, payload_key)

          event if apply_status(event, status, started_at, ended_at)
        end

        # Triggers the state machine event matching `status`.
        # Returns nil for any status other than 'firing' or 'resolved'.
        def apply_status(event, status, started_at, ended_at)
          case status
          when 'firing'
            event.fire(started_at)
          when 'resolved'
            event.resolve(ended_at)
          end
        end

        # Like `payload.dig(*keys)` but returns nil instead of raising when
        # the payload (or a nested value) is not keyed by strings, e.g. an
        # Array entry or a `labels` value that is a plain String.
        def dig_payload(payload, *keys)
          payload.dig(*keys)
        rescue TypeError
          nil
        end

        def alerts
          params['alerts']
        end

        # Returns `date` unchanged when `Time.parse` accepts it, nil otherwise.
        def validate_date(date)
          return unless date

          Time.parse(date)
          date
        rescue ArgumentError, TypeError
          # ArgumentError: the string cannot be parsed as a time.
          # TypeError: the value is not a String in the first place.
          nil
        end
      end
    end
  end
end
...@@ -9,6 +9,8 @@ module Projects ...@@ -9,6 +9,8 @@ module Projects
notification_service.async.prometheus_alerts_fired(project, firings) if firings.any? notification_service.async.prometheus_alerts_fired(project, firings) if firings.any?
persist_events(project, current_user, params)
true true
end end
...@@ -19,12 +21,20 @@ module Projects ...@@ -19,12 +21,20 @@ module Projects
end end
def alerts_by_status(status) def alerts_by_status(status)
params['alerts'].select { |alert| alert['status'] == status } alerts.select { |alert| alert['status'] == status }
end
def alerts
params['alerts']
end end
def valid? def valid?
params['version'] == '4' params['version'] == '4'
end end
def persist_events(project, current_user, params)
CreateEventsService.new(project, current_user, params).execute
end
end end
end end
end end
......
---
title: Persist Prometheus alert events
merge_request: 7493
author:
type: added
# frozen_string_literal: true
# Creates the `prometheus_alert_events` table, which references both
# `projects` and `prometheus_alerts` with cascading deletes.
class CreatePrometheusAlertEvents < ActiveRecord::Migration
  include Gitlab::Database::MigrationHelpers

  DOWNTIME = false

  # Explicit name for the unique (prometheus_alert_id, payload_key) index.
  INDEX_ALERT_PAYLOAD_KEY = 'index_prometheus_alert_event_scoped_payload_key'

  def change
    create_table(:prometheus_alert_events, id: :bigserial) do |table|
      table.references :project, null: false, foreign_key: { on_delete: :cascade }
      table.references :prometheus_alert, null: false, foreign_key: { on_delete: :cascade }

      table.datetime_with_timezone :started_at, null: false
      table.datetime_with_timezone :ended_at

      table.integer :status, limit: 2
      table.string :payload_key

      table.index %i[project_id status]
      table.index %i[prometheus_alert_id payload_key], unique: true, name: INDEX_ALERT_PAYLOAD_KEY
    end
  end
end
# frozen_string_literal: true
FactoryBot.define do
  # Builds a PrometheusAlertEvent which is `firing` by default.
  #
  # All attributes use the dynamic (block) syntax: static attributes such as
  # `payload_key nil` are deprecated in FactoryBot 4.11 and removed in 5.0.
  factory :prometheus_alert_event do
    project { prometheus_alert.project }
    prometheus_alert
    sequence(:payload_key) { |n| "hash payload key #{n}" }
    status { PrometheusAlertEvent.status_value_for(:firing) }
    started_at { Time.now }

    # A resolved event has an end date and no payload key.
    trait :resolved do
      status { PrometheusAlertEvent.status_value_for(:resolved) }
      ended_at { Time.now }
      payload_key { nil }
    end

    # An event which has not been fired yet.
    trait :none do
      status { nil }
      started_at { nil }
    end
  end
end
...@@ -57,6 +57,7 @@ project: ...@@ -57,6 +57,7 @@ project:
- vulnerability_identifiers - vulnerability_identifiers
- vulnerability_scanners - vulnerability_scanners
- prometheus_alerts - prometheus_alerts
- prometheus_alert_events
- software_license_policies - software_license_policies
- project_registry - project_registry
- packages - packages
...@@ -65,6 +66,9 @@ prometheus_metrics: ...@@ -65,6 +66,9 @@ prometheus_metrics:
- prometheus_alerts - prometheus_alerts
prometheus_alerts: prometheus_alerts:
- project - project
- prometheus_alert_events
prometheus_alert_events:
- project
epic_issues: epic_issues:
- issue - issue
- epic - epic
# frozen_string_literal: true
require 'spec_helper'
describe PrometheusAlertEvent do
  # The factory builds a `firing` event by default.
  subject { build(:prometheus_alert_event) }

  let(:alert) { subject.prometheus_alert }

  describe 'associations' do
    it { is_expected.to belong_to(:project) }
    it { is_expected.to belong_to(:prometheus_alert) }
  end

  describe 'validations' do
    it { is_expected.to be_valid }
    it { is_expected.to validate_presence_of(:prometheus_alert) }
    it { is_expected.to validate_uniqueness_of(:payload_key).scoped_to(:prometheus_alert_id) }
    it { is_expected.to validate_presence_of(:started_at) }

    describe 'payload_key & ended_at' do
      context 'when firing' do
        subject { build(:prometheus_alert_event) }

        it { is_expected.to validate_presence_of(:payload_key) }
        it { is_expected.not_to validate_presence_of(:ended_at) }
      end

      context 'when resolved' do
        subject { build(:prometheus_alert_event, :resolved) }

        it { is_expected.not_to validate_presence_of(:payload_key) }
        it { is_expected.to validate_presence_of(:ended_at) }
      end
    end
  end

  describe '#title' do
    it 'delegates to alert' do
      expect(subject.title).to eq(alert.title)
    end
  end

  describe '#prometheus_metric_id' do
    it 'delegates to alert' do
      expect(subject.prometheus_metric_id).to eq(alert.prometheus_metric_id)
    end
  end

  describe 'transitions' do
    describe 'fire' do
      let(:started_at) { Time.now }

      context 'when status is none' do
        subject { build(:prometheus_alert_event, :none) }

        it 'fires an event' do
          result = subject.fire(started_at)

          expect(result).to eq(true)
          expect(subject).to be_firing
          expect(subject.started_at).to eq(started_at)
        end
      end

      context 'when firing' do
        subject { build(:prometheus_alert_event) }

        it 'cannot fire again' do
          result = subject.fire(started_at)

          expect(result).to eq(false)
        end
      end
    end

    describe 'resolve' do
      let(:ended_at) { Time.now }

      context 'when firing' do
        subject { build(:prometheus_alert_event) }

        it 'resolves an event' do
          result = subject.resolve!(ended_at)

          expect(result).to eq(true)
          expect(subject).to be_resolved
          expect(subject.ended_at).to eq(ended_at)
          expect(subject.payload_key).to be_nil
        end
      end

      context 'when resolved' do
        subject { build(:prometheus_alert_event, :resolved) }

        it 'cannot resolve again' do
          result = subject.resolve(ended_at)

          expect(result).to eq(false)
        end
      end
    end
  end
end
# frozen_string_literal: true
require 'spec_helper'
describe Projects::Prometheus::Alerts::CreateEventsService do
  let(:user) { create(:user) }
  set(:project) { create(:project) }
  let(:metric) { create(:prometheus_metric, project: project) }
  let(:service) { described_class.new(project, user, alerts_payload) }

  shared_examples 'events persisted' do |expected_count|
    subject { service.execute }

    it 'returns proper amount of created events' do
      expect(subject.size).to eq(expected_count)
    end

    it 'increments event count' do
      # `by` asserts the increment itself, independent of pre-existing rows.
      expect { subject }.to change { PrometheusAlertEvent.count }.by(expected_count)
    end
  end

  shared_examples 'no events persisted' do
    subject { service.execute }

    it 'returns no created events' do
      expect(subject).to be_empty
    end

    it 'does not change event count' do
      expect { subject }.not_to change { PrometheusAlertEvent.count }
    end
  end

  context 'with valid alerts_payload' do
    let!(:alert) { create(:prometheus_alert, prometheus_metric: metric, project: project) }

    let(:events) { service.execute }

    context 'with a firing payload' do
      let(:started_at) { truncate_to_second(Time.now) }
      let(:firing_event) { alert_payload(status: 'firing', started_at: started_at) }
      let(:alerts_payload) { { 'alerts' => [firing_event] } }

      it_behaves_like 'events persisted', 1

      it 'returns created event' do
        event = events.first

        expect(event).to be_firing
        expect(event.started_at).to eq(started_at)
        expect(event.ended_at).to be_nil
      end

      context 'with 2 different firing events' do
        let(:another_firing_event) { alert_payload(status: 'firing', started_at: started_at + 1) }
        let(:alerts_payload) { { 'alerts' => [firing_event, another_firing_event] } }

        it_behaves_like 'events persisted', 2
      end

      context 'with already persisted firing event' do
        before do
          service.execute
        end

        it_behaves_like 'no events persisted'
      end

      context 'with duplicate payload' do
        let(:alerts_payload) { { 'alerts' => [firing_event, firing_event] } }

        it_behaves_like 'events persisted', 1
      end
    end

    context 'with a resolved payload' do
      let(:started_at) { truncate_to_second(Time.now) }
      let(:ended_at) { started_at + 1 }
      let(:payload_key) { PrometheusAlertEvent.payload_key_for(alert.prometheus_metric_id, utc_rfc3339(started_at)) }
      let(:resolved_event) { alert_payload(status: 'resolved', started_at: started_at, ended_at: ended_at) }
      let(:alerts_payload) { { 'alerts' => [resolved_event] } }

      context 'with a matching firing event' do
        before do
          create(:prometheus_alert_event,
                 prometheus_alert: alert,
                 payload_key: payload_key,
                 started_at: started_at)
        end

        it 'does not create an additional event' do
          expect { service.execute }.not_to change { PrometheusAlertEvent.count }
        end

        it 'marks firing event as `resolved`' do
          expect(events.size).to eq(1)

          event = events.first
          expect(event).to be_resolved
          expect(event.started_at).to eq(started_at)
          expect(event.ended_at).to eq(ended_at)
        end

        context 'with duplicate payload' do
          let(:alerts_payload) { { 'alerts' => [resolved_event, resolved_event] } }

          it 'does not create an additional event' do
            expect { service.execute }.not_to change { PrometheusAlertEvent.count }
          end

          it 'marks firing event as `resolved` only once' do
            expect(events.size).to eq(1)
          end
        end
      end

      context 'without a matching firing event' do
        context 'due to payload_key' do
          let(:payload_key) { 'some other payload_key' }

          before do
            create(:prometheus_alert_event,
                   prometheus_alert: alert,
                   payload_key: payload_key,
                   started_at: started_at)
          end

          it_behaves_like 'no events persisted'
        end

        context 'due to status' do
          before do
            create(:prometheus_alert_event, :resolved,
                   prometheus_alert: alert,
                   started_at: started_at)
          end

          it_behaves_like 'no events persisted'
        end
      end

      context 'with already resolved event' do
        before do
          service.execute
        end

        it_behaves_like 'no events persisted'
      end
    end

    context 'with a metric from another project' do
      let(:another_project) { create(:project) }
      let(:metric) { create(:prometheus_metric, project: another_project) }
      let(:alerts_payload) { { 'alerts' => [alert_payload] } }

      let!(:alert) do
        create(:prometheus_alert,
               prometheus_metric: metric,
               project: another_project)
      end

      it_behaves_like 'no events persisted'
    end
  end

  context 'with invalid payload' do
    let(:alert) { create(:prometheus_alert, prometheus_metric: metric, project: project) }

    describe '`alerts` key' do
      context 'is missing' do
        let(:alerts_payload) { {} }

        it_behaves_like 'no events persisted'
      end

      context 'is nil' do
        let(:alerts_payload) { { 'alerts' => nil } }

        it_behaves_like 'no events persisted'
      end

      context 'is empty' do
        let(:alerts_payload) { { 'alerts' => [] } }

        it_behaves_like 'no events persisted'
      end

      context 'is not a Hash' do
        let(:alerts_payload) { { 'alerts' => [:not_a_hash] } }

        it_behaves_like 'no events persisted'
      end

      describe '`status`' do
        context 'is missing' do
          let(:alerts_payload) { { 'alerts' => [alert_payload(status: nil)] } }

          it_behaves_like 'no events persisted'
        end

        context 'is invalid' do
          let(:alerts_payload) { { 'alerts' => [alert_payload(status: 'invalid')] } }

          it_behaves_like 'no events persisted'
        end
      end

      describe '`started_at`' do
        context 'is missing' do
          let(:alerts_payload) { { 'alerts' => [alert_payload(started_at: nil)] } }

          it_behaves_like 'no events persisted'
        end

        context 'is invalid' do
          let(:alerts_payload) { { 'alerts' => [alert_payload(started_at: 'invalid date')] } }

          it_behaves_like 'no events persisted'
        end
      end

      describe '`ended_at`' do
        context 'is missing' do
          let(:alerts_payload) { { 'alerts' => [alert_payload(ended_at: nil)] } }

          it_behaves_like 'no events persisted'
        end

        context 'is invalid' do
          let(:alerts_payload) { { 'alerts' => [alert_payload(ended_at: 'invalid date')] } }

          it_behaves_like 'no events persisted'
        end
      end

      describe '`labels`' do
        describe '`gitlab_alert_id`' do
          context 'is missing' do
            let(:alerts_payload) { { 'alerts' => [alert_payload(gitlab_alert_id: nil)] } }

            it_behaves_like 'no events persisted'
          end

          context 'is invalid' do
            let(:alerts_payload) { { 'alerts' => [alert_payload(gitlab_alert_id: '-1')] } }

            it_behaves_like 'no events persisted'
          end
        end
      end
    end
  end

  private

  # Builds a single alert entry of the payload. Passing nil for a keyword
  # leaves the corresponding key out of the payload.
  def alert_payload(status: 'firing', started_at: Time.now, ended_at: Time.now, gitlab_alert_id: alert.prometheus_metric_id)
    payload = {}

    payload['status'] = status if status
    payload['startsAt'] = utc_rfc3339(started_at) if started_at
    payload['endsAt'] = utc_rfc3339(ended_at) if ended_at
    payload['labels'] = { 'gitlab_alert_id' => gitlab_alert_id.to_s } if gitlab_alert_id

    payload
  end

  # Formats a time in UTC with nanoseconds.
  # Example: 2018-09-27T18:25:31.079079416Z
  #
  # Values which are not time-like (e.g. the 'invalid date' string used by
  # the examples above) are passed through untouched.
  def utc_rfc3339(date)
    return date unless date.respond_to?(:utc)

    date.utc.strftime("%FT%T.%9NZ")
  end

  def truncate_to_second(date)
    date.change(usec: 0)
  end
end
...@@ -11,11 +11,14 @@ describe Projects::Prometheus::Alerts::NotifyService do ...@@ -11,11 +11,14 @@ describe Projects::Prometheus::Alerts::NotifyService do
let(:alert_firing) { create(:prometheus_alert, project: project) } let(:alert_firing) { create(:prometheus_alert, project: project) }
let(:alert_resolved) { create(:prometheus_alert, project: project) } let(:alert_resolved) { create(:prometheus_alert, project: project) }
let(:notification_service) { spy } let(:notification_service) { spy }
let(:create_events_service) { spy }
let(:payload) { payload_for(firing: [alert_firing], resolved: [alert_resolved]) } let(:payload) { payload_for(firing: [alert_firing], resolved: [alert_resolved]) }
let(:payload_alert_firing) { payload['alerts'].first } let(:payload_alert_firing) { payload['alerts'].first }
before do before do
allow(NotificationService).to receive(:new).and_return(notification_service) allow(NotificationService).to receive(:new).and_return(notification_service)
allow(Projects::Prometheus::Alerts::CreateEventsService)
.to receive(:new).and_return(create_events_service)
end end
it 'sends a notification for firing alerts only' do it 'sends a notification for firing alerts only' do
...@@ -25,6 +28,12 @@ describe Projects::Prometheus::Alerts::NotifyService do ...@@ -25,6 +28,12 @@ describe Projects::Prometheus::Alerts::NotifyService do
expect(service.execute).to eq(true) expect(service.execute).to eq(true)
end end
it 'persists events' do
expect(create_events_service).to receive(:execute)
expect(service.execute).to eq(true)
end
end end
context 'with invalid payload' do context 'with invalid payload' do
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment