Commit 2ed76dc1 authored by João Cunha, committed by Heinrich Lee Yu

Delete Kubernetes cluster and resources

- Add Clusters::Cluster.with_persisted_applications scope
- Create workers to remove the cluster and its resources
- Ignore non-existent metrics on Prometheus uninstallation
- Inform the frontend of the cluster removal status via the status_name attribute
- Guarantee the kubeclient core API can delete namespaces and service accounts
- Create cluster cleanup_status state machine
- Create cleanup_status_reason for error description
parent 9230e97b
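The cleanup_status state machine and cleanup_status_reason column mentioned in the commit message are introduced outside the hunks shown here. As a rough sketch only, assuming the state_machines-activerecord DSL and the state/event names exercised by the services and specs below (column values, exact transitions, and hooks are guesses, not taken from this diff):

  # Sketch, not part of this diff: states and events inferred from the cleanup
  # services and specs in this commit.
  state_machine :cleanup_status, initial: :cleanup_not_started do
    state :cleanup_not_started
    state :cleanup_uninstalling_applications
    state :cleanup_removing_project_namespaces
    state :cleanup_removing_service_account
    state :cleanup_errored

    event :start_cleanup do
      transition [:cleanup_not_started, :cleanup_errored] => :cleanup_uninstalling_applications
    end

    event :continue_cleanup do
      transition cleanup_uninstalling_applications: :cleanup_removing_project_namespaces,
                 cleanup_removing_project_namespaces: :cleanup_removing_service_account
    end

    event :make_cleanup_errored do
      transition any => :cleanup_errored
    end

    before_transition any => :cleanup_errored do |cluster, transition|
      # cleanup_status_reason holds the error description (see make_cleanup_errored! callers)
      cluster.cleanup_status_reason = transition.args.first
    end

    # The specs below expect each transition to enqueue the next cleanup worker,
    # so the real code would likely schedule it from an after_transition hook.
    after_transition any => :cleanup_removing_project_namespaces do |cluster|
      Clusters::Cleanup::ProjectNamespaceWorker.perform_async(cluster.id)
    end

    after_transition any => :cleanup_removing_service_account do |cluster|
      Clusters::Cleanup::ServiceAccountWorker.perform_async(cluster.id)
    end
  end

The Clusters::DestroyService spec near the end of this diff exercises the cleanup_not_started to cleanup_uninstalling_applications transition.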
......@@ -172,11 +172,7 @@ class Clusters::ClustersController < Clusters::BaseController
private
def destroy_params
# To be uncommented on https://gitlab.com/gitlab-org/gitlab/merge_requests/16954
# This MR got split into others since it was too big.
#
# params.permit(:cleanup)
{}
params.permit(:cleanup)
end
def update_params
......
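With :cleanup permitted again, the controller's destroy action (outside this hunk) can pass the flag through to the destroy service. A hypothetical call site; the action body and the Clusters::DestroyService interface are assumptions, not shown in this diff:

  # Hypothetical; the actual destroy action is not part of this commit.
  def destroy
    # destroy_params now yields { "cleanup" => "true" } when the user opts in,
    # letting the service choose between a plain delete and a full cleanup.
    Clusters::DestroyService.new(current_user, destroy_params).execute(cluster)

    head :no_content
  end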
......@@ -23,6 +23,7 @@ module Clusters
}.freeze
DEFAULT_ENVIRONMENT = '*'
KUBE_INGRESS_BASE_DOMAIN = 'KUBE_INGRESS_BASE_DOMAIN'
APPLICATIONS_ASSOCIATIONS = APPLICATIONS.values.map(&:association_name).freeze
belongs_to :user
belongs_to :management_project, class_name: '::Project', optional: true
......@@ -117,7 +118,7 @@ module Clusters
scope :aws_installed, -> { aws_provided.joins(:provider_aws).merge(Clusters::Providers::Aws.with_status(:created)) }
scope :managed, -> { where(managed: true) }
scope :with_persisted_applications, -> { eager_load(*APPLICATIONS_ASSOCIATIONS) }
scope :default_environment, -> { where(environment_scope: DEFAULT_ENVIRONMENT) }
scope :for_project_namespace, -> (namespace_id) { joins(:projects).where(projects: { namespace_id: namespace_id }) }
......@@ -195,9 +196,13 @@ module Clusters
{ connection_status: retrieve_connection_status }
end
def persisted_applications
APPLICATIONS_ASSOCIATIONS.map(&method(:public_send)).compact
end
def applications
APPLICATIONS.values.map do |application_class|
public_send(application_class.association_name) || public_send("build_#{application_class.association_name}") # rubocop:disable GitlabSecurity/PublicSend
APPLICATIONS_ASSOCIATIONS.map do |association_name|
public_send(association_name) || public_send("build_#{association_name}") # rubocop:disable GitlabSecurity/PublicSend
end
end
......
# frozen_string_literal: true
module Clusters
module Cleanup
class AppService < Clusters::Cleanup::BaseService
def execute
persisted_applications = @cluster.persisted_applications
persisted_applications.each do |app|
next unless app.available?
next unless app.can_uninstall?
log_event(:uninstalling_app, application: app.class.application_name)
uninstall_app_async(app)
end
# Keep calling the worker until all dependencies are uninstalled
return schedule_next_execution(Clusters::Cleanup::AppWorker) if persisted_applications.any?
log_event(:schedule_remove_project_namespaces)
cluster.continue_cleanup!
end
private
def uninstall_app_async(application)
application.make_scheduled!
Clusters::Applications::UninstallWorker.perform_async(application.name, application.id)
end
end
end
end
# frozen_string_literal: true
module Clusters
module Cleanup
class BaseService
DEFAULT_EXECUTION_INTERVAL = 1.minute
def initialize(cluster, execution_count = 0)
@cluster = cluster
@execution_count = execution_count
end
private
attr_reader :cluster
def logger
@logger ||= Gitlab::Kubernetes::Logger.build
end
def log_event(event, extra_data = {})
meta = {
service: self.class.name,
cluster_id: cluster.id,
execution_count: @execution_count,
event: event
}
logger.info(meta.merge(extra_data))
end
def schedule_next_execution(worker_class)
log_event(:scheduling_execution, next_execution: @execution_count + 1)
worker_class.perform_in(execution_interval, cluster.id, @execution_count + 1)
end
# Override this method to customize the execution interval
def execution_interval
DEFAULT_EXECUTION_INTERVAL
end
end
end
end
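execution_interval is intended as an override point; a purely illustrative subclass that polls less aggressively could look like the following (the class does not exist in this commit):

  # Illustrative only; not part of this commit.
  module Clusters
    module Cleanup
      class SlowPollingService < BaseService
        def execute
          # ...do one slice of cleanup work, then lean on the base helper...
          schedule_next_execution(Clusters::Cleanup::AppWorker)
        end

        private

        # Overrides BaseService#execution_interval (default: 1.minute)
        def execution_interval
          5.minutes
        end
      end
    end
  end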
# frozen_string_literal: true
module Clusters
module Cleanup
class ProjectNamespaceService < BaseService
KUBERNETES_NAMESPACE_BATCH_SIZE = 100
def execute
delete_project_namespaces_in_batches
# Keep calling the worker until all namespaces are deleted
if cluster.kubernetes_namespaces.exists?
return schedule_next_execution(Clusters::Cleanup::ProjectNamespaceWorker)
end
cluster.continue_cleanup!
end
private
def delete_project_namespaces_in_batches
kubernetes_namespaces_batch = cluster.kubernetes_namespaces.first(KUBERNETES_NAMESPACE_BATCH_SIZE)
kubernetes_namespaces_batch.each do |kubernetes_namespace|
log_event(:deleting_project_namespace, namespace: kubernetes_namespace.namespace)
begin
kubeclient_delete_namespace(kubernetes_namespace)
rescue Kubeclient::HttpError
next
end
kubernetes_namespace.destroy!
end
end
def kubeclient_delete_namespace(kubernetes_namespace)
cluster.kubeclient.delete_namespace(kubernetes_namespace.namespace)
rescue Kubeclient::ResourceNotFoundError
# no-op: nothing to delete
end
end
end
end
# frozen_string_literal: true
module Clusters
module Cleanup
class ServiceAccountService < BaseService
def execute
delete_gitlab_service_account
log_event(:destroying_cluster)
cluster.destroy!
end
private
def delete_gitlab_service_account
log_event(:deleting_gitlab_service_account)
cluster.kubeclient.delete_service_account(
::Clusters::Kubernetes::GITLAB_SERVICE_ACCOUNT_NAME,
::Clusters::Kubernetes::GITLAB_SERVICE_ACCOUNT_NAMESPACE
)
rescue Kubeclient::ResourceNotFoundError
end
end
end
end
......@@ -38,6 +38,9 @@
- gcp_cluster:cluster_patch_app
- gcp_cluster:cluster_upgrade_app
- gcp_cluster:cluster_provision
- gcp_cluster:clusters_cleanup_app
- gcp_cluster:clusters_cleanup_project_namespace
- gcp_cluster:clusters_cleanup_service_account
- gcp_cluster:cluster_wait_for_app_installation
- gcp_cluster:wait_for_cluster_creation
- gcp_cluster:cluster_wait_for_ingress_ip_address
......
......@@ -3,13 +3,16 @@
module Clusters
module Cleanup
class AppWorker
include ApplicationWorker
include ClusterQueue
include ClusterApplications
include ClusterCleanupMethods
# TODO: Merge with https://gitlab.com/gitlab-org/gitlab/merge_requests/16954
# We're splitting the above MR into smaller chunks to facilitate reviews
def perform
def perform(cluster_id, execution_count = 0)
Clusters::Cluster.with_persisted_applications.find_by_id(cluster_id).try do |cluster|
break unless cluster.cleanup_uninstalling_applications?
break exceeded_execution_limit(cluster) if exceeded_execution_limit?(execution_count)
::Clusters::Cleanup::AppService.new(cluster, execution_count).execute
end
end
end
end
......
......@@ -3,13 +3,16 @@
module Clusters
module Cleanup
class ProjectNamespaceWorker
include ApplicationWorker
include ClusterQueue
include ClusterApplications
include ClusterCleanupMethods
# TODO: Merge with https://gitlab.com/gitlab-org/gitlab/merge_requests/16954
# We're splitting the above MR into smaller chunks to facilitate reviews
def perform
def perform(cluster_id, execution_count = 0)
Clusters::Cluster.find_by_id(cluster_id).try do |cluster|
break unless cluster.cleanup_removing_project_namespaces?
break exceeded_execution_limit(cluster) if exceeded_execution_limit?(execution_count)
Clusters::Cleanup::ProjectNamespaceService.new(cluster, execution_count).execute
end
end
end
end
......
......@@ -3,13 +3,14 @@
module Clusters
module Cleanup
class ServiceAccountWorker
include ApplicationWorker
include ClusterQueue
include ClusterApplications
include ClusterCleanupMethods
# TODO: Merge with https://gitlab.com/gitlab-org/gitlab/merge_requests/16954
# We're splitting the above MR into smaller chunks to facilitate reviews
def perform
def perform(cluster_id)
Clusters::Cluster.find_by_id(cluster_id).try do |cluster|
break unless cluster.cleanup_removing_service_account?
Clusters::Cleanup::ServiceAccountService.new(cluster).execute
end
end
end
end
......
# frozen_string_literal: true
# Concern for setting Sidekiq settings for the cluster cleanup workers.
module ClusterCleanupMethods
extend ActiveSupport::Concern
include ApplicationWorker
include ClusterQueue
DEFAULT_EXECUTION_LIMIT = 10
ExceededExecutionLimitError = Class.new(StandardError)
included do
worker_has_external_dependencies!
sidekiq_options retry: 3
sidekiq_retries_exhausted do |msg, error|
cluster_id = msg['args'][0]
cluster = Clusters::Cluster.find_by_id(cluster_id)
cluster.make_cleanup_errored!("#{self.class.name} retried too many times") if cluster
logger = Gitlab::Kubernetes::Logger.build
logger.error({
exception: error,
cluster_id: cluster_id,
class_name: msg['class'],
event: :sidekiq_retries_exhausted,
message: msg['error_message']
})
end
end
private
# Override this method to customize the execution_limit
def execution_limit
DEFAULT_EXECUTION_LIMIT
end
def exceeded_execution_limit?(execution_count)
execution_count >= execution_limit
end
def logger
@logger ||= Gitlab::Kubernetes::Logger.build
end
def exceeded_execution_limit(cluster)
log_exceeded_execution_limit_error(cluster)
cluster.make_cleanup_errored!("#{self.class.name} exceeded the execution limit")
end
def cluster_applications_and_status(cluster)
cluster.persisted_applications
.map { |application| "#{application.name}:#{application.status_name}" }
.join(",")
end
def log_exceeded_execution_limit_error(cluster)
logger.error({
exception: ExceededExecutionLimitError.name,
cluster_id: cluster.id,
class_name: self.class.name,
cleanup_status: cluster.cleanup_status_name,
applications: cluster_applications_and_status(cluster),
event: :failed_to_remove_cluster_and_resources,
message: "exceeded execution limit of #{execution_limit} tries"
})
end
end
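execution_limit is the analogous override point on the worker side; a hypothetical worker that tolerates more retries before erroring the cluster might override it like this (class name and limit are illustrative only):

  # Illustrative only; no such worker exists in this commit.
  module Clusters
    module Cleanup
      class SlowResourceWorker
        include ClusterCleanupMethods

        def perform(cluster_id, execution_count = 0)
          Clusters::Cluster.find_by_id(cluster_id).try do |cluster|
            break exceeded_execution_limit(cluster) if exceeded_execution_limit?(execution_count)
            # ...do a slice of work, then reschedule through the matching cleanup service...
          end
        end

        private

        # Overrides ClusterCleanupMethods#execution_limit (default: 10)
        def execution_limit
          30
        end
      end
    end
  end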
---
title: Delete Kubernetes cluster association and resources
merge_request: 16954
author:
type: added
......@@ -90,6 +90,28 @@ FactoryBot.define do
domain { 'example.com' }
end
trait :with_environments do
transient do
environments { %i(staging production) }
end
cluster_type { Clusters::Cluster.cluster_types[:project_type] }
before(:create) do |cluster, evaluator|
cluster_project = create(:cluster_project, cluster: cluster)
evaluator.environments.each do |env_name|
environment = create(:environment, name: env_name, project: cluster_project.project)
cluster.kubernetes_namespaces << create(:cluster_kubernetes_namespace,
cluster: cluster,
cluster_project: cluster_project,
project: cluster_project.project,
environment: environment)
end
end
end
trait :not_managed do
managed { false }
end
......
......@@ -500,6 +500,48 @@ describe Clusters::Cluster, :use_clean_rails_memory_store_caching do
end
end
describe '.with_persisted_applications' do
let(:cluster) { create(:cluster) }
let!(:helm) { create(:clusters_applications_helm, :installed, cluster: cluster) }
it 'preloads persisted applications' do
query_rec = ActiveRecord::QueryRecorder.new do
described_class.with_persisted_applications.find_by_id(cluster.id).application_helm
end
expect(query_rec.count).to eq(1)
end
end
describe '#persisted_applications' do
let(:cluster) { create(:cluster) }
subject { cluster.persisted_applications }
context 'when all applications are created' do
let!(:helm) { create(:clusters_applications_helm, cluster: cluster) }
let!(:ingress) { create(:clusters_applications_ingress, cluster: cluster) }
let!(:cert_manager) { create(:clusters_applications_cert_manager, cluster: cluster) }
let!(:prometheus) { create(:clusters_applications_prometheus, cluster: cluster) }
let!(:runner) { create(:clusters_applications_runner, cluster: cluster) }
let!(:jupyter) { create(:clusters_applications_jupyter, cluster: cluster) }
let!(:knative) { create(:clusters_applications_knative, cluster: cluster) }
it 'returns a list of created applications' do
is_expected.to contain_exactly(helm, ingress, cert_manager, prometheus, runner, jupyter, knative)
end
end
context 'when not all were created' do
let!(:helm) { create(:clusters_applications_helm, cluster: cluster) }
let!(:ingress) { create(:clusters_applications_ingress, cluster: cluster) }
it 'returns a list of created applications' do
is_expected.to contain_exactly(helm, ingress)
end
end
end
describe '#applications' do
set(:cluster) { create(:cluster) }
......
# frozen_string_literal: true
require 'spec_helper'
describe Clusters::Cleanup::AppService do
describe '#execute' do
let!(:cluster) { create(:cluster, :project, :cleanup_uninstalling_applications, provider_type: :gcp) }
let(:service) { described_class.new(cluster) }
let(:logger) { service.send(:logger) }
let(:log_meta) do
{
service: described_class.name,
cluster_id: cluster.id,
execution_count: 0
}
end
subject { service.execute }
shared_examples 'does not reschedule itself' do
it 'does not reschedule itself' do
expect(Clusters::Cleanup::AppWorker).not_to receive(:perform_in)
end
end
context 'when cluster has no applications available or transitioning applications' do
it_behaves_like 'does not reschedule itself'
it 'transitions cluster to cleanup_removing_project_namespaces' do
expect { subject }
.to change { cluster.reload.cleanup_status_name }
.from(:cleanup_uninstalling_applications)
.to(:cleanup_removing_project_namespaces)
end
it 'schedules Clusters::Cleanup::ProjectNamespaceWorker' do
expect(Clusters::Cleanup::ProjectNamespaceWorker).to receive(:perform_async).with(cluster.id)
subject
end
it 'logs all events' do
expect(logger).to receive(:info)
.with(log_meta.merge(event: :schedule_remove_project_namespaces))
subject
end
end
context 'when cluster has uninstallable applications' do
shared_examples 'reschedules itself' do
it 'reschedules itself' do
expect(Clusters::Cleanup::AppWorker)
.to receive(:perform_in)
.with(1.minute, cluster.id, 1)
subject
end
end
context 'has applications with dependencies' do
let!(:helm) { create(:clusters_applications_helm, :installed, cluster: cluster) }
let!(:ingress) { create(:clusters_applications_ingress, :installed, cluster: cluster) }
let!(:cert_manager) { create(:clusters_applications_cert_manager, :installed, cluster: cluster) }
let!(:jupyter) { create(:clusters_applications_jupyter, :installed, cluster: cluster) }
it_behaves_like 'reschedules itself'
it 'only uninstalls apps that are not dependencies for other installed apps' do
expect(Clusters::Applications::UninstallWorker)
.not_to receive(:perform_async).with(helm.name, helm.id)
expect(Clusters::Applications::UninstallWorker)
.not_to receive(:perform_async).with(ingress.name, ingress.id)
expect(Clusters::Applications::UninstallWorker)
.to receive(:perform_async).with(cert_manager.name, cert_manager.id)
.and_call_original
expect(Clusters::Applications::UninstallWorker)
.to receive(:perform_async).with(jupyter.name, jupyter.id)
.and_call_original
subject
end
it 'logs application uninstalls and next execution' do
expect(logger).to receive(:info)
.with(log_meta.merge(event: :uninstalling_app, application: kind_of(String))).exactly(2).times
expect(logger).to receive(:info)
.with(log_meta.merge(event: :scheduling_execution, next_execution: 1))
subject
end
context 'cluster is not cleanup_uninstalling_applications' do
let!(:cluster) { create(:cluster, :project, provider_type: :gcp) }
it_behaves_like 'does not reschedule itself'
end
end
context 'when applications are still uninstalling/scheduled/depending on others' do
let!(:helm) { create(:clusters_applications_helm, :installed, cluster: cluster) }
let!(:ingress) { create(:clusters_applications_ingress, :scheduled, cluster: cluster) }
let!(:runner) { create(:clusters_applications_runner, :uninstalling, cluster: cluster) }
it_behaves_like 'reschedules itself'
it 'does not call the uninstallation service' do
expect(Clusters::Applications::UninstallWorker).not_to receive(:new)
subject
end
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
describe Clusters::Cleanup::ProjectNamespaceService do
describe '#execute' do
subject { service.execute }
let!(:service) { described_class.new(cluster) }
let!(:cluster) { create(:cluster, :with_environments, :cleanup_removing_project_namespaces) }
let!(:logger) { service.send(:logger) }
let(:log_meta) do
{
service: described_class.name,
cluster_id: cluster.id,
execution_count: 0
}
end
let(:kubeclient_instance_double) do
instance_double(Gitlab::Kubernetes::KubeClient, delete_namespace: nil, delete_service_account: nil)
end
before do
allow_any_instance_of(Clusters::Cluster).to receive(:kubeclient).and_return(kubeclient_instance_double)
end
context 'when cluster has namespaces to be deleted' do
it 'deletes namespaces from cluster' do
expect(kubeclient_instance_double).to receive(:delete_namespace)
.with(cluster.kubernetes_namespaces[0].namespace)
expect(kubeclient_instance_double).to receive(:delete_namespace)
.with(cluster.kubernetes_namespaces[1].namespace)
subject
end
it 'deletes namespaces from database' do
expect { subject }.to change { cluster.kubernetes_namespaces.exists? }.from(true).to(false)
end
it 'schedules ::ServiceAccountWorker' do
expect(Clusters::Cleanup::ServiceAccountWorker).to receive(:perform_async).with(cluster.id)
subject
end
it 'logs all events' do
expect(logger).to receive(:info)
.with(
log_meta.merge(
event: :deleting_project_namespace,
namespace: cluster.kubernetes_namespaces[0].namespace))
expect(logger).to receive(:info)
.with(
log_meta.merge(
event: :deleting_project_namespace,
namespace: cluster.kubernetes_namespaces[1].namespace))
subject
end
end
context 'when cluster has no namespaces' do
let!(:cluster) { create(:cluster, :cleanup_removing_project_namespaces) }
it 'schedules Clusters::Cleanup::ServiceAccountWorker' do
expect(Clusters::Cleanup::ServiceAccountWorker).to receive(:perform_async).with(cluster.id)
subject
end
it 'transitions to cleanup_removing_service_account' do
expect { subject }
.to change { cluster.reload.cleanup_status_name }
.from(:cleanup_removing_project_namespaces)
.to(:cleanup_removing_service_account)
end
it 'does not try to delete namespaces' do
expect(kubeclient_instance_double).not_to receive(:delete_namespace)
subject
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
describe Clusters::Cleanup::ServiceAccountService do
describe '#execute' do
subject { service.execute }
let!(:service) { described_class.new(cluster) }
let!(:cluster) { create(:cluster, :cleanup_removing_service_account) }
let!(:logger) { service.send(:logger) }
let(:log_meta) do
{
service: described_class.name,
cluster_id: cluster.id,
execution_count: 0
}
end
let(:kubeclient_instance_double) do
instance_double(Gitlab::Kubernetes::KubeClient, delete_namespace: nil, delete_service_account: nil)
end
before do
allow_any_instance_of(Clusters::Cluster).to receive(:kubeclient).and_return(kubeclient_instance_double)
end
it 'deletes gitlab service account' do
expect(kubeclient_instance_double).to receive(:delete_service_account)
.with(
::Clusters::Kubernetes::GITLAB_SERVICE_ACCOUNT_NAME,
::Clusters::Kubernetes::GITLAB_SERVICE_ACCOUNT_NAMESPACE)
subject
end
it 'logs all events' do
expect(logger).to receive(:info).with(log_meta.merge(event: :deleting_gitlab_service_account))
expect(logger).to receive(:info).with(log_meta.merge(event: :destroying_cluster))
subject
end
it 'deletes cluster' do
expect { subject }.to change { Clusters::Cluster.where(id: cluster.id).exists? }.from(true).to(false)
end
end
end
......@@ -45,7 +45,7 @@ describe Clusters::DestroyService do
expect(Clusters::Cluster.where(id: cluster.id).exists?).not_to be_falsey
end
it 'transition cluster#cleanup_status from cleanup_not_started to uninstalling_applications' do
it 'transitions cluster#cleanup_status from cleanup_not_started to cleanup_uninstalling_applications' do
expect { subject }.to change { cluster.cleanup_status_name }
.from(:cleanup_not_started)
.to(:cleanup_uninstalling_applications)
......
# frozen_string_literal: true
shared_examples 'cluster cleanup worker base specs' do
it 'transitions to errored if sidekiq retries exhausted' do
job = { 'args' => [cluster.id, 0], 'jid' => '123' }
described_class.sidekiq_retries_exhausted_block.call(job)
expect(cluster.reload.cleanup_status_name).to eq(:cleanup_errored)
end
end
# frozen_string_literal: true
require 'spec_helper'
describe Clusters::Cleanup::AppWorker do
describe '#perform' do
subject { worker_instance.perform(cluster.id) }
let!(:worker_instance) { described_class.new }
let!(:cluster) { create(:cluster, :project, :cleanup_uninstalling_applications, provider_type: :gcp) }
let!(:logger) { worker_instance.send(:logger) }
it_behaves_like 'cluster cleanup worker base specs'
context 'when exceeded the execution limit' do
subject { worker_instance.perform(cluster.id, worker_instance.send(:execution_limit)) }
let(:worker_instance) { described_class.new }
let(:logger) { worker_instance.send(:logger) }
let!(:helm) { create(:clusters_applications_helm, :installed, cluster: cluster) }
let!(:ingress) { create(:clusters_applications_ingress, :scheduled, cluster: cluster) }
it 'logs the error' do
expect(logger).to receive(:error)
.with(
hash_including(
exception: 'ClusterCleanupMethods::ExceededExecutionLimitError',
cluster_id: kind_of(Integer),
class_name: described_class.name,
applications: "helm:installed,ingress:scheduled",
cleanup_status: cluster.cleanup_status_name,
event: :failed_to_remove_cluster_and_resources,
message: "exceeded execution limit of 10 tries"
)
)
subject
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
describe Clusters::Cleanup::ProjectNamespaceWorker do
describe '#perform' do
context 'when cluster.cleanup_status is cleanup_removing_project_namespaces' do
let!(:cluster) { create(:cluster, :with_environments, :cleanup_removing_project_namespaces) }
let!(:worker_instance) { described_class.new }
let!(:logger) { worker_instance.send(:logger) }
it_behaves_like 'cluster cleanup worker base specs'
it 'calls Clusters::Cleanup::ProjectNamespaceService' do
expect_any_instance_of(Clusters::Cleanup::ProjectNamespaceService).to receive(:execute).once
subject.perform(cluster.id)
end
context 'when exceeded the execution limit' do
subject { worker_instance.perform(cluster.id, worker_instance.send(:execution_limit)) }
it 'logs the error' do
expect(logger).to receive(:error)
.with(
hash_including(
exception: 'ClusterCleanupMethods::ExceededExecutionLimitError',
cluster_id: kind_of(Integer),
class_name: described_class.name,
applications: "",
cleanup_status: cluster.cleanup_status_name,
event: :failed_to_remove_cluster_and_resources,
message: "exceeded execution limit of 10 tries"
)
)
subject
end
end
end
context 'when cluster.cleanup_status is not cleanup_removing_project_namespaces' do
let!(:cluster) { create(:cluster, :with_environments) }
it 'does not call Clusters::Cleanup::ProjectNamespaceService' do
expect(Clusters::Cleanup::ProjectNamespaceService).not_to receive(:new)
subject.perform(cluster.id)
end
end
end
end
# frozen_string_literal: true
require 'spec_helper'
describe Clusters::Cleanup::ServiceAccountWorker do
describe '#perform' do
let!(:cluster) { create(:cluster, :cleanup_removing_service_account) }
context 'when cluster.cleanup_status is cleanup_removing_service_account' do
it 'calls Clusters::Cleanup::ServiceAccountService' do
expect_any_instance_of(Clusters::Cleanup::ServiceAccountService).to receive(:execute).once
subject.perform(cluster.id)
end
end
context 'when cluster.cleanup_status is not cleanup_removing_service_account' do
let!(:cluster) { create(:cluster, :with_environments) }
it 'does not call Clusters::Cleanup::ServiceAccountService' do
expect(Clusters::Cleanup::ServiceAccountService).not_to receive(:new)
subject.perform(cluster.id)
end
end
end
end