Commit 015add27 authored by Rafael Monnerat

slapos_crm: Check whether slapgrid is stalled whenever checking if the computer is down

See merge request nexedi/slapos.core!468
parents 024787b4 5348fed0
Pipeline #25495 failed with stage
in 0 seconds
......@@ -13,58 +13,133 @@ if context.getAllocationScope("open").startswith("close"):
reference = context.getReference()
compute_node_title = context.getTitle()
ticket_title = "[MONITORING] Lost contact with compute_node %s" % reference
node_ticket_title = "[MONITORING] Lost contact with compute_node %s" % reference
instance_ticket_title = "[MONITORING] Compute Node %s has a stalled instance process" % reference
software_ticket_title = "[MONITORING] Compute Node %s has a stalled software process" % reference
ticket_title = node_ticket_title
description = ""
last_contact = "No Contact Information"
notification_message_reference = 'slapos-crm-compute_node_check_state.notification'
now = DateTime()
d = context.getAccessStatus()
# Ignore if data isn't present.
should_notify = False
if d.get("no_data") == 1:
should_notify = True
description = "The Compute Node %s (%s) has not contacted the server (No Contact Information)" % (
compute_node_title, reference)
else:
last_contact = DateTime(d.get('created_at'))
if (DateTime() - last_contact) > 0.01:
if (now - last_contact) > 0.01:
should_notify = True
description = "The Compute Node %s (%s) has not contacted the server for more than 30 minutes" \
"(last contact date: %s)" % (compute_node_title, reference, last_contact)
else:
# Nothing to notify.
return
support_request = person.Base_getSupportRequestInProgress(
title=ticket_title,
aggregate=context.getRelativeUrl())
if not should_notify:
# Since server is contacting, check for stalled processes
ticket_title = instance_ticket_title
notification_message_reference = 'slapos-crm-compute_node_check_stalled_instance_state.notification'
last_contact = "No Contact Information"
if support_request is None:
person.notify(support_request_title=ticket_title,
support_request_description=description,
aggregate=context.getRelativeUrl())
# If server has no partitions skip
compute_partition_uid_list = [
x.getUid() for x in context.contentValues(portal_type="Compute Partition")
if x.getSlapState() == 'busy']
if compute_partition_uid_list:
instance_list = portal.portal_catalog(
portal_type='Software Instance',
default_aggregate_uid=compute_partition_uid_list)
if instance_list:
should_notify = True
description = "The Compute Node %s (%s) didnt process its instances for more them 24 hours" % (
compute_node_title, reference)
for instance in instance_list:
instance_access_status = instance.getAccessStatus()
if instance_access_status.get('no_data', None):
# Ignore if there isnt any data
continue
# At lest one partition contacted in the last 24h30min.
last_contact = max(DateTime(instance_access_status.get('created_at')), last_contact)
if (now - DateTime(instance_access_status.get('created_at'))) < 1.01:
should_notify = False
description = ""
break
support_request_relative_url = context.REQUEST.get("support_request_relative_url")
if support_request_relative_url is None:
return
if not should_notify:
ticket_title = software_ticket_title
notification_message_reference = 'slapos-crm-compute_node_check_stalled_software_state.notification'
last_contact = "No Contact Information"
support_request = portal.restrictedTraverse(support_request_relative_url)
# Since server is contacting, check for stalled software releases processes
software_installation_list = portal.portal_catalog(
portal_type='Software Installation',
default_aggregate_uid=context.getUid(),
validation_state='validated')
if support_request is None:
return
if software_installation_list:
should_notify = True
description = "The Compute Node %s (%s) didnt process its software releases for more them 24 hours" % (
compute_node_title, reference)
# Check whether the compute node failed to process its software
# installations for more than 24 hours.
for installation in software_installation_list:
  installation_access_status = installation.getAccessStatus()
  if installation_access_status.get('no_data', None):
    # Ignore installations that have no status data at all.
    continue
  # Use this installation's own access status for the timestamp, and parse
  # it only once for both the "latest contact" and the freshness test.
  installation_contact = DateTime(installation_access_status.get('created_at'))
  # NOTE(review): last_contact is reset to a placeholder string before this
  # loop; confirm that max() between a DateTime and that string yields the
  # intended value on the target Python/DateTime versions.
  last_contact = max(installation_contact, last_contact)
  # DateTime differences are expressed in days: 1.01 is a little over 24h.
  if (now - installation_contact) < 1.01:
    # At least one installation reported recently: nothing to notify.
    should_notify = False
    description = ""
    break
# Send Notification message
notification_message = portal.portal_notifications.getDocumentValue(
reference='slapos-crm-compute_node_check_state.notification')
if should_notify:
support_request = person.Base_getSupportRequestInProgress(
title=node_ticket_title,
aggregate=context.getRelativeUrl())
if notification_message is None:
message = """%s""" % description
else:
mapping_dict = {'compute_node_title':context.getTitle(),
'compute_node_id':reference,
'last_contact':last_contact}
message = notification_message.asText(
substitution_method_parameter_dict={'mapping_dict': mapping_dict})
if support_request is None:
support_request = person.Base_getSupportRequestInProgress(
title=ticket_title,
aggregate=context.getRelativeUrl())
event = support_request.SupportRequest_getLastEvent(ticket_title)
if event is None:
support_request.notify(message_title=ticket_title, message=message)
if support_request is None:
person.notify(support_request_title=ticket_title,
support_request_description=description,
aggregate=context.getRelativeUrl())
support_request_relative_url = context.REQUEST.get("support_request_relative_url")
if support_request_relative_url is None:
return
return support_request
support_request = portal.restrictedTraverse(support_request_relative_url)
if support_request is None:
return
# Send Notification message
notification_message = portal.portal_notifications.getDocumentValue(
reference=notification_message_reference)
if notification_message is None:
message = """%s""" % description
else:
mapping_dict = {'compute_node_title':context.getTitle(),
'compute_node_id':reference,
'last_contact':last_contact}
message = notification_message.asText(
substitution_method_parameter_dict={'mapping_dict': mapping_dict})
event = support_request.SupportRequest_getLastEvent(ticket_title)
if event is None:
support_request.notify(message_title=ticket_title, message=message)
return support_request
......@@ -1113,6 +1113,128 @@ class TestSlapOSComputeNode_CheckState(TestCRMSkinsMixin):
self.assertEqual(event.getDestination(), ticket.getSourceSection())
self.assertEqual(event.getSource(), person.getRelativeUrl())
@simulate('ERP5Site_isSupportRequestCreationClosed', '*args, **kwargs','return 0')
@simulate(
  'NotificationTool_getDocumentValue',
  'reference=None',
  'assert reference == "slapos-crm-compute_node_check_stalled_instance_state.notification", reference\n'
  'return context.restrictedTraverse('
  'context.REQUEST["test_ComputeNode_checkState_stalled_instance"])')
def test_ComputeNode_checkState_stalled_instance(self):
  # Scenario: the compute node itself reports access, but its software
  # instance last reported more than a day ago, so a "stalled instance"
  # support request with exactly one event is expected.
  node_owner = self.makePerson(user=0)
  node = self._makeComputeNode(owner=node_owner)[0]
  self._makeComplexComputeNode()
  administrator = node.getSourceAdministrationValue()

  # The simulated notification tool resolves the message through REQUEST.
  notification_path = self._makeNotificationMessage(node.getReference())
  self.portal.REQUEST['test_ComputeNode_checkState_stalled_instance'] = notification_path

  # Compute node is in contact right now.
  node.setAccessStatus("")
  try:
    # Record the instance status as if it was reported 1.1 days ago.
    past_date = DateTime() - 1.1
    self.pinDateTime(past_date)
    self.start_requested_software_instance.setAccessStatus("")
  finally:
    self.unpinDateTime()

  node.ComputeNode_checkState()
  self.tic()

  expected_title = "[MONITORING] Compute Node %s has a stalled instance process" % node.getReference()
  support_request = self._getGeneratedSupportRequest(node.getUid(), expected_title)
  self.assertNotEqual(support_request, None)

  related_event_list = support_request.getFollowUpRelatedValueList()
  self.assertEqual(len(related_event_list), 1)
  notification_event = related_event_list[0]

  self.assertEqual(notification_event.getTitle(), support_request.getTitle())
  self.assertIn(node.getReference(), notification_event.getTextContent())
  self.assertEqual(notification_event.getDestination(), support_request.getSourceSection())
  self.assertEqual(notification_event.getSource(), administrator.getRelativeUrl())
@simulate('ERP5Site_isSupportRequestCreationClosed', '*args, **kwargs','return 0')
@simulate(
  'NotificationTool_getDocumentValue',
  'reference=None',
  'assert reference == "slapos-crm-compute_node_check_stalled_software_state.notification", reference\n'
  'return context.restrictedTraverse('
  'context.REQUEST["test_ComputeNode_checkState_stalled_software"])')
def test_ComputeNode_checkState_stalled_software(self):
  # Scenario: the compute node and its software instance report access,
  # but the software installation last reported more than a day ago, so a
  # "stalled software" support request with exactly one event is expected.
  node_owner = self.makePerson(user=0)
  node = self._makeComputeNode(owner=node_owner)[0]
  self._makeComplexComputeNode()
  administrator = node.getSourceAdministrationValue()

  # The simulated notification tool resolves the message through REQUEST.
  notification_path = self._makeNotificationMessage(node.getReference())
  self.portal.REQUEST['test_ComputeNode_checkState_stalled_software'] = notification_path

  # Compute node is in contact right now, and so is the hosted instance.
  node.setAccessStatus("")
  self.start_requested_software_instance.setAccessStatus("")
  try:
    # Record the installation status as if it was reported 1.1 days ago.
    past_date = DateTime() - 1.1
    self.pinDateTime(past_date)
    self.start_requested_software_installation.setAccessStatus("")
  finally:
    self.unpinDateTime()

  node.ComputeNode_checkState()
  self.tic()

  expected_title = "[MONITORING] Compute Node %s has a stalled software process" % node.getReference()
  support_request = self._getGeneratedSupportRequest(node.getUid(), expected_title)
  self.assertNotEqual(support_request, None)

  related_event_list = support_request.getFollowUpRelatedValueList()
  self.assertEqual(len(related_event_list), 1)
  notification_event = related_event_list[0]

  self.assertEqual(notification_event.getTitle(), support_request.getTitle())
  self.assertIn(node.getReference(), notification_event.getTextContent())
  self.assertEqual(notification_event.getDestination(), support_request.getSourceSection())
  self.assertEqual(notification_event.getSource(), administrator.getRelativeUrl())
@simulate('ERP5Site_isSupportRequestCreationClosed', '*args, **kwargs','return 0')
@simulate(
  'NotificationTool_getDocumentValue',
  'reference=None',
  'assert reference == "slapos-crm-compute_node_check_stalled_instance_state.notification", reference\n'
  'return context.restrictedTraverse('
  'context.REQUEST["test_ComputeNode_checkState_stalled_instance"])')
def test_ComputeNode_checkState_stalled_instance_single(self):
  # Scenario: both the software instance and the software installation
  # last reported more than a day ago. The test expects the "stalled
  # instance" support request, with exactly one event.
  node_owner = self.makePerson(user=0)
  node = self._makeComputeNode(owner=node_owner)[0]
  self._makeComplexComputeNode()
  administrator = node.getSourceAdministrationValue()

  # The simulated notification tool resolves the message through REQUEST.
  notification_path = self._makeNotificationMessage(node.getReference())
  self.portal.REQUEST['test_ComputeNode_checkState_stalled_instance'] = notification_path

  # Compute node is in contact right now.
  node.setAccessStatus("")
  try:
    # Record both statuses as if they were reported 1.1 days ago.
    past_date = DateTime() - 1.1
    self.pinDateTime(past_date)
    self.start_requested_software_instance.setAccessStatus("")
    self.start_requested_software_installation.setAccessStatus("")
  finally:
    self.unpinDateTime()

  node.ComputeNode_checkState()
  self.tic()

  expected_title = "[MONITORING] Compute Node %s has a stalled instance process" % node.getReference()
  support_request = self._getGeneratedSupportRequest(node.getUid(), expected_title)
  self.assertNotEqual(support_request, None)

  related_event_list = support_request.getFollowUpRelatedValueList()
  self.assertEqual(len(related_event_list), 1)
  notification_event = related_event_list[0]

  self.assertEqual(notification_event.getTitle(), support_request.getTitle())
  self.assertIn(node.getReference(), notification_event.getTextContent())
  self.assertEqual(notification_event.getDestination(), support_request.getSourceSection())
  self.assertEqual(notification_event.getSource(), administrator.getRelativeUrl())
class TestSlapOSInstanceTree_createSupportRequestEvent(SlapOSTestCaseMixin):
def _makeNotificationMessage(self, reference):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment