Commit 4d3a3731 authored by Rafael Monnerat

slapos_crm: Reimplement check Instance Tree check State

    Merge the check into Project_checkMonitoring so that it is launched on a per-project basis.
parent 5c2bbac9
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="Alarm" module="erp5.portal_type"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>active_sense_method_id</string> </key>
<value> <string>Alarm_checkInstanceTreeState</string> </value>
</item>
<item>
<key> <string>automatic_solve</string> </key>
<value> <int>0</int> </value>
</item>
<item>
<key> <string>description</string> </key>
<value> <string>Check and create a Ticket when an instance is partially allocated for more than 4 hours.</string> </value>
</item>
<item>
<key> <string>enabled</string> </key>
<value> <int>1</int> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>slapos_crm_check_instance_in_error</string> </value>
</item>
<item>
<key> <string>periodicity_hour</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>periodicity_hour_frequency</string> </key>
<value>
<none/>
</value>
</item>
<item>
<key> <string>periodicity_minute</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>periodicity_minute_frequency</string> </key>
<value> <int>5</int> </value>
</item>
<item>
<key> <string>periodicity_month</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>periodicity_month_day</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>periodicity_start_date</string> </key>
<value>
<object>
<klass>
<global name="_reconstructor" module="copy_reg"/>
</klass>
<tuple>
<global name="DateTime" module="DateTime.DateTime"/>
<global name="object" module="__builtin__"/>
<none/>
</tuple>
<state>
<tuple>
<float>1406073600.0</float>
<string>GMT</string>
</tuple>
</state>
</object>
</value>
</item>
<item>
<key> <string>periodicity_week</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Alarm</string> </value>
</item>
<item>
<key> <string>sense_method_id</string> </key>
<value>
<none/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Check partially allocated Instance for more than 4 hours</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
# Alarm body: spawn InstanceTree_checkSoftwareInstanceState for every validated
# Instance Tree returned by the catalog.
# `context` and `tag` are not defined in this body; presumably they come from
# the script bindings/parameters -- TODO confirm.

# The check goes through all documents without changing them directly, so the
# spawned activities get priority 2 in order not to block other activities.
activity_parameter_dict = {'tag': tag, 'priority': 2}

catalog_tool = context.getPortalObject().portal_catalog
catalog_tool.searchAndActivate(
  method_id='InstanceTree_checkSoftwareInstanceState',
  portal_type='Instance Tree',
  validation_state='validated',
  activate_kw=activity_parameter_dict
)

# Queue getId on the alarm itself behind the tagged activities (presumably so
# the alarm stays active until they are all done -- TODO confirm).
context.activate(after_tag=tag).getId()
# NOTE(review): this span looks like a rendered diff whose +/- markers and
# indentation were lost. `portal` is assigned twice, and `portal_type`,
# `method_id` and `activate_kw` are each passed twice to the same call (the
# first `activate_kw` line also lacks a trailing comma), so it is not valid
# Python as written. Recover the real script from the repository before
# reusing it.
portal = context.getPortalObject()
# None is passed as second argument, presumably the value returned when the
# category does not exist -- note that `.getUid()` is called on the result
# below without a None check.
monitor_enabled_category = portal.restrictedTraverse(
"portal_categories/monitor_scope/enabled", None)
portal = context.getPortalObject()
portal.portal_catalog.searchAndActivate(
portal_type='Project',
portal_type='Compute Node',
validation_state='validated',
method_id='Project_checkMonitoringState',
activate_kw={'tag': tag}
# The spelling "Montoring" matches the script id declared in the XML export
# further down this page; do not "fix" it here alone.
method_id='ComputeNode_checkProjectMontoringState',
monitor_scope__uid=monitor_enabled_category.getUid(),
group_by=['follow_up_uid'],
# The tag is forwarded both to the called method (method_kw) and to the
# spawned activities (activate_kw).
method_kw={'tag': tag},
activate_kw={'tag': tag, 'priority': 2}
)
# Queue getId on the context with after_tag=tag (presumably so it runs once the
# tagged activities are finished -- TODO confirm).
context.activate(after_tag=tag).getId()
......@@ -2,19 +2,37 @@ portal = context.getPortalObject()
# NOTE(review): rendered diff with the +/- markers and the indentation lost.
# Old and new lines appear back to back (two "support request creation closed"
# checks, `follow_up__uid` and `activate_kw` each passed twice in one call), so
# this is not valid Python as written. Recover the real script from the
# repository before reusing it.
# `portal` is assigned on the line quoted in the hunk header just above.
monitor_enabled_category = portal.restrictedTraverse(
"portal_categories/monitor_scope/enabled", None)
if context.Project_isSupportRequestCreationClosed():
# The project is read from the context's follow-up relation, restricted to the
# 'Project' portal type, and is asserted to exist.
project = context.getFollowUpValue(portal_type='Project')
assert project is not None
if project.Project_isSupportRequestCreationClosed():
return
if monitor_enabled_category is not None:
# NOTE(review): project_uid is assigned right after the `if` above and is also
# read by the second searchAndActivate call below; with the indentation lost it
# cannot be told from here whether that second call sits inside this branch --
# verify.
project_uid = project.getUid()
portal.portal_catalog.searchAndActivate(
portal_type='Compute Node',
validation_state='validated',
monitor_scope__uid=monitor_enabled_category.getUid(),
follow_up__uid=context.getUid(),
follow_up__uid=project_uid,
method_id='ComputeNode_checkMonitoringState',
# This alarm brute-force checks all documents,
# without changing them directly.
# Increase priority to not block other activities
activate_kw={'tag':tag, 'priority': 2}
activate_kw={'tag': tag, 'priority': 2}
)
context.activate(after_tag=tag).getId()
portal.portal_catalog.searchAndActivate(
# Slave Instances are included because of the unallocated use case
portal_type=['Software Instance', 'Slave Instance'],
validation_state='validated',
follow_up__uid=project_uid,
# Grouped by specialise_uid, presumably so that a single instance per
# Instance Tree is activated -- TODO confirm.
group_by=['specialise_uid'],
method_id='SoftwareInstance_checkInstanceTreeMonitoringState',
# This alarm brute-force checks all documents,
# without changing them directly.
# Increase priority to not block other activities
activate_kw={'tag': tag, 'priority': 2}
)
project.activate(after_tag=tag).getId()
......@@ -54,7 +54,7 @@
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Project_checkMontoringState</string> </value>
<value> <string>ComputeNode_checkProjectMontoringState</string> </value>
</item>
</dictionary>
</pickle>
......
......@@ -17,7 +17,6 @@ error_dict = {
'issue_document_reference': None
}
if compute_node.getMonitorScope() == "disabled":
for i in ['ticket_title', 'ticket_description', 'last_contact']:
error_dict[i] = "Monitor is disabled on this Compute Node."
......
from DateTime import DateTime
from erp5.component.module.DateUtils import addToDate
instance_tree = context
portal = context.getPortalObject()
project = context.getFollowUpValue()
if project.Project_isSupportRequestCreationClosed():
return
date_check_limit = addToDate(DateTime(), to_add={'hour': -1})
if (date_check_limit - instance_tree.getCreationDate()) < 0:
# Too early to check
return
software_instance_list = context.portal_catalog(
portal_type=["Software Instance", "Slave Instance"],
specialise__uid=instance_tree.getUid(),
**{"slapos_item.slap_state": ["start_requested"]})
# Check if at least one software Instance is Allocated
notification_message_reference = None
for instance in software_instance_list:
if (date_check_limit - instance.getCreationDate()) < 0:
continue
if instance.getSlapState() != "start_requested":
continue
compute_partition = instance.getAggregateValue()
if compute_partition is None:
notification_message_reference = 'slapos-crm-instance-tree-instance-allocation.notification'
elif (instance.getPortalType() == "Software Instance") and \
(compute_partition.getParentValue().getPortalType() == "Compute Node") and \
(compute_partition.getParentValue().getMonitorScope() == "enabled") and \
instance.SoftwareInstance_hasReportedError(tolerance=30):
notification_message_reference = 'slapos-crm-instance-tree-instance-state.notification'
if notification_message_reference is not None:
ticket_title = "Instance Tree %s is failing." % context.getTitle()
error_message = instance.SoftwareInstance_hasReportedError(include_message=True)
description = "%s contains software instances which are unallocated or reporting errors." % (
context.getTitle())
if error_message:
description += "\n\nMessage: %s" % str(error_message)
else:
error_message = "No message!"
support_request = project.Project_createSupportRequestWithCausality(
ticket_title,
description,
causality=context.getRelativeUrl(),
destination_decision=context.getDestinationSection()
)
if support_request is None:
return
support_request.Ticket_createProjectEvent(
ticket_title, 'outgoing', 'Web Message',
portal.service_module.slapos_crm_information.getRelativeUrl(),
text_content=description,
content_type='text/plain',
notification_message=notification_message_reference,
#language=XXX,
substitution_method_parameter_dict={
'instance_tree_title':context.getTitle(),
'instance': instance.getTitle(),
'error_text': error_message
}
)
return
from DateTime import DateTime
from erp5.component.module.DateUtils import addToDate
portal = context.getPortalObject()
assert context.getPortalType() in ['Software Instance', 'Slave Instance']
instance_tree = context.getSpecialiseValue(portal_type="Instance Tree")
assert instance_tree is not None
project = instance_tree.getFollowUpValue()
if project.Project_isSupportRequestCreationClosed():
return
date_check_limit = addToDate(DateTime(), to_add={'hour': -1})
if (date_check_limit - instance_tree.getCreationDate()) < 0:
# Too early to check
return
software_instance_list = portal.portal_catalog(
portal_type=["Software Instance", "Slave Instance"],
specialise__uid=instance_tree.getUid(),
**{"slapos_item.slap_state": ["start_requested"]})
# Check if at least one software Instance is Allocated
for instance in software_instance_list:
if (date_check_limit - instance.getCreationDate()) < 0:
continue
error_dict = instance.SoftwareInstance_getReportedErrorDict(tolerance=30)
if error_dict['should_notify']:
support_request = project.Project_createSupportRequestWithCausality(
error_dict['ticket_title'],
error_dict['ticket_description'],
causality=instance_tree.getRelativeUrl(),
destination_decision=instance_tree.getDestinationSection()
)
if support_request is not None:
support_request.Ticket_createProjectEvent(
error_dict['ticket_title'], 'outgoing', 'Web Message',
portal.service_module.slapos_crm_information.getRelativeUrl(),
text_content=error_dict['ticket_description'],
content_type='text/plain',
notification_message=error_dict['notification_message_reference'],
#language=XXX,
substitution_method_parameter_dict=error_dict
)
return support_request
......@@ -54,7 +54,7 @@
</item>
<item>
<key> <string>id</string> </key>
<value> <string>InstanceTree_checkSoftwareInstanceState</string> </value>
<value> <string>SoftwareInstance_checkInstanceTreeMonitoringState</string> </value>
</item>
</dictionary>
</pickle>
......
from DateTime import DateTime
error_dict = {
'should_notify': None,
'ticket_title': None,
'ticket_description': None,
'instance_tree_title':context.getSpecialiseTitle(),
'instance': context.getTitle(),
'notification_message_reference': None,
'last_contact': None,
'since': None,
'error_text': None,
'message': None
}
# Nothing to do
if context.getSlapState() != "start_requested":
return error_dict
def updateErrorDictWithError(_error_dict):
_error_dict['should_notify'] = True
_error_dict['ticket_title'] = "Instance Tree %s is failing." % _error_dict['instance_tree_title']
return _error_dict
compute_partition = context.getAggregateValue(portal_type="Compute Partition")
if compute_partition is None:
error_dict['notification_message_reference'] = 'slapos-crm-instance-tree-instance-allocation.notification'
error_dict['message'] = "%s is not allocated." % context.getTitle()
error_dict['ticket_description'] = error_dict['message']
return updateErrorDictWithError(error_dict)
compute_node = compute_partition.getParentValue()
if compute_node.getPortalType() == "Compute Node" and \
compute_node.getAllocationScope() == 'close/forever':
# Closed compute_nodes like this might contains unremoved instances hanging there
error_dict['notification_message_reference'] = 'slapos-crm-instance-tree-instance-on-close-computer.notification'
error_dict = updateErrorDictWithError(error_dict)
error_dict['message'] = "%s is allocated on a Compute node that is closed forever." % context.getTitle()
error_dict['ticket_description'] = error_dict['message']
return error_dict
if context.getPortalType() == 'Slave Instance':
# We skip if the the slave is already allocated.
return error_dict
# Skip to check if monitor disabled on the compute node.
# Remote node has no state.
if compute_node.getPortalType() != "Compute Node":
portal_type = compute_partition.getParentValue().getPortalType()
error_dict['ticket_title'] = "Instance is allocated on a %s" % portal_type
error_dict['ticket_description'] = error_dict['ticket_title']
return error_dict
if compute_partition.getParentValue().getMonitorScope() != "enabled":
error_dict['ticket_title'] = "Monitor is disabled on the Compute Node"
error_dict['ticket_description'] = error_dict['ticket_title']
return error_dict
d = context.getAccessStatus()
# Ignore if data isn't present.
if d.get("no_data", None) == 1:
error_dict['ticket_title'] = "Not possible to connect"
error_dict['ticket_description'] = "Not possible to connect"
return error_dict
error_dict['error_text'] = d['text']
error_dict['last_contact'] = DateTime(d.get('created_at'))
error_dict['since'] = DateTime(d.get('since'))
if error_dict['error_text'].startswith('#error '):
if ((DateTime()-error_dict['since'])*24*60) > tolerance:
error_dict['notification_message_reference'] = 'slapos-crm-instance-tree-instance-state.notification'
description = "%s is reporting errors. \n\nMessage: %s" % (context.getTitle(), str(error_dict['error_text']))
error_dict['ticket_description'] = description
# Longer form for consistency.
error_dict['message'] = "%s has error (%s, %s at %s)" % (
context.getReference(), context.getTitle(), context.getUrlString(), compute_node.getReference())
return updateErrorDictWithError(error_dict)
return error_dict
......@@ -50,11 +50,11 @@
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>tag, fixit, params</string> </value>
<value> <string>tolerance=0</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Alarm_checkInstanceTreeState</string> </value>
<value> <string>SoftwareInstance_getReportedErrorDict</string> </value>
</item>
</dictionary>
</pickle>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment