Commit 50d81df1 authored by Rafael Monnerat's avatar Rafael Monnerat

slapos_crm: Implement alarm to check Allocation Consistency

    Add a daily alarm to verify that all allocated instances have a proper Allocation Supply configured and that they
    still respect the SLA.

    Extend SoftwareInstance_getReportedErrorDict to verify the SLA since, in general, the context of the report should be the instance tree. Asserting the SLA is cheap to compute, and this script already has specific checks to inform the user about bad approaches.
parent b5b946a9
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="Alarm" module="erp5.portal_type"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>active_sense_method_id</string> </key>
<value> <string>Alarm_checkProjectAllocationConsistencyState</string> </value>
</item>
<item>
<key> <string>automatic_solve</string> </key>
<value> <int>0</int> </value>
</item>
<item>
<key> <string>description</string> </key>
<value> <string>Check per project and trigger activities to verify all compute nodes (monitored) per project.</string> </value>
</item>
<item>
<key> <string>enabled</string> </key>
<value> <int>1</int> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>slapos_crm_project_allocation_consistency</string> </value>
</item>
<item>
<key> <string>periodicity_hour</string> </key>
<value>
<tuple>
<int>2</int>
</tuple>
</value>
</item>
<item>
<key> <string>periodicity_hour_frequency</string> </key>
<value>
<none/>
</value>
</item>
<item>
<key> <string>periodicity_minute</string> </key>
<value>
<tuple>
<int>2</int>
</tuple>
</value>
</item>
<item>
<key> <string>periodicity_minute_frequency</string> </key>
<value>
<none/>
</value>
</item>
<item>
<key> <string>periodicity_month</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>periodicity_month_day</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>periodicity_start_date</string> </key>
<value>
<object>
<klass>
<global name="_reconstructor" module="copy_reg"/>
</klass>
<tuple>
<global name="DateTime" module="DateTime.DateTime"/>
<global name="object" module="__builtin__"/>
<none/>
</tuple>
<state>
<tuple>
<float>1288051200.0</float>
<string>GMT</string>
</tuple>
</state>
</object>
</value>
</item>
<item>
<key> <string>periodicity_week</string> </key>
<value>
<tuple/>
</value>
</item>
<item>
<key> <string>portal_type</string> </key>
<value> <string>Alarm</string> </value>
</item>
<item>
<key> <string>sense_method_id</string> </key>
<value>
<none/>
</value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string>Create tickets for Compute nodes that contains inconsistent instances</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
from Products.ZSQLCatalog.SQLCatalog import SimpleQuery, ComplexQuery
# Alarm sense script: spawn one
# ComputeNode_checkProjectAllocationConsistencyState activity per project
# (results are grouped by follow_up_uid).
#
# Script parameters: tag, fixit, params
#   tag    -- activity tag given to every spawned activity; the final
#             activity on the context waits for it.
#   fixit, params -- accepted but not used by this script.
portal = context.getPortalObject()

# restrictedTraverse is called with a None default, so the category may be
# missing; in that case no node can match and the search is skipped
# instead of failing on None.getUid().
monitor_enabled_category = portal.restrictedTraverse(
  "portal_categories/monitor_scope/enabled", None)

if monitor_enabled_category is not None:
  portal.portal_catalog.searchAndActivate(
    validation_state='validated',
    method_id='ComputeNode_checkProjectAllocationConsistencyState',
    # Any Remote Node, or a Compute Node whose monitor scope is enabled.
    node=ComplexQuery(
      SimpleQuery(portal_type='Remote Node'),
      ComplexQuery(
        SimpleQuery(portal_type='Compute Node'),
        SimpleQuery(monitor_scope__uid=monitor_enabled_category.getUid()),
        logical_operator='and'
      ),
      logical_operator='or'
    ),
    group_by=['follow_up_uid'],
    method_kw={'tag': tag},
    activate_kw={'tag': tag, 'priority': 2}
  )

# Activity that only runs once every activity carrying the tag is done.
context.activate(after_tag=tag).getId()
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="_reconstructor" module="copy_reg"/>
</klass>
<tuple>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
<global name="object" module="__builtin__"/>
<none/>
</tuple>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>tag, fixit, params</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Alarm_checkProjectAllocationConsistencyState</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
from DateTime import DateTime
import six
portal = context.getPortalObject()
# Remote Node has no monitor scope.
if context.getPortalType() == "Compute Node" and \
context.getMonitorScope() == "disabled":
return
project = context.getFollowUpValue()
if project.Project_isSupportRequestCreationClosed():
return
# Exceptionally, we pre-check if the computer has a ticket already
# Since calculation is a bit expensive to "just try".
monitor_service_uid = portal.service_module.slapos_crm_monitoring.getUid()
ticket_portal_type = "Support Request"
if portal.portal_catalog.getResultValue(
portal_type=ticket_portal_type,
resource__uid=monitor_service_uid,
simulation_state=["validated", "submitted", "suspended"],
causality__uid=context.getUid(),
) is not None:
return
reference = context.getReference()
compute_node_title = context.getTitle()
# Use same dict from monitoring so we are consistent while writting
# Notification messages
error_dict = {
'should_notify': None,
'ticket_title': "%s has missing allocation supplies." % compute_node_title,
'ticket_description': None,
'notification_message_reference': 'slapos-crm-compute_node_check_allocation_supply_state.notification',
'compute_node_title': compute_node_title,
'compute_node_id': reference,
'last_contact': None,
'issue_document_reference': None,
'message': None,
'compute_node_error_dict': {}
}
# Since we would like a single ticket per compute node do all at once:
for compute_partition in context.contentValues(portal_type='Compute Partition'):
if compute_partition.getSlapState() == 'busy':
compute_partition_error_dict = compute_partition.ComputePartition_checkAllocationConsistencyState(
known_error_dict=error_dict['compute_node_error_dict'])
if compute_partition_error_dict:
error_dict['should_notify'] = True
error_dict['compute_node_error_dict'].update(compute_partition_error_dict)
if not error_dict['should_notify']:
return
message = """The following contains instances that has Software Releases/Types that are missing on this %s's Allocation Supply configuration:
""" % context.getPortalType()
# Sample compute_node_error_dict[software_release_url][software_type] = (instance, compute_partition)
for sofware_release_url in error_dict['compute_node_error_dict']:
message += """ * Software Release: %s
""" % sofware_release_url
for sofware_type, value_list in six.iteritems(error_dict['compute_node_error_dict'][sofware_release_url]):
message += """ * Software Type: %s (ie: %s on %s)
""" % (sofware_type, value_list[0].getTitle(), value_list[1].getReference())
error_dict['message'] = message
support_request = project.Project_createTicketWithCausality(
ticket_portal_type,
error_dict['ticket_title'],
error_dict['ticket_description'],
causality=context.getRelativeUrl(),
destination_decision=project.getDestination()
)
if support_request is not None:
support_request.Ticket_createProjectEvent(
error_dict['ticket_title'], 'outgoing', 'Web Message',
portal.service_module.slapos_crm_information.getRelativeUrl(),
text_content=error_dict['message'],
content_type='text/plain',
notification_message=error_dict['notification_message_reference'],
#language=XXX,
substitution_method_parameter_dict=error_dict
)
return support_request
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="_reconstructor" module="copy_reg"/>
</klass>
<tuple>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
<global name="object" module="__builtin__"/>
<none/>
</tuple>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string></string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>ComputeNode_checkAllocationConsistencyState</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
from Products.ZSQLCatalog.SQLCatalog import SimpleQuery, ComplexQuery
# Per-project step of the allocation consistency alarm.
#
# Script parameter: tag -- activity tag applied to every activity spawned
# below and awaited by the final project activity.
#
# Looks up the Project followed by the context, and spawns one
# ComputeNode_checkAllocationConsistencyState activity for each validated
# node of that project matching the query below.
#
# NOTE(review): the indentation of this script was lost in this view; the
# statements following "if monitor_enabled_category is not None:" are
# presumably its body, but whether the final project.activate(...) call is
# inside or outside that branch cannot be told from here -- confirm against
# the original script.
portal = context.getPortalObject()
# None when the category cannot be traversed (explicit None default).
monitor_enabled_category = portal.restrictedTraverse(
"portal_categories/monitor_scope/enabled", None)
project = context.getFollowUpValue(portal_type='Project')
assert project is not None
# Nothing to do when support request creation is closed for the project.
if project.Project_isSupportRequestCreationClosed():
return
if monitor_enabled_category is not None:
project_uid = project.getUid()
portal.portal_catalog.searchAndActivate(
# Any Remote Node, or a Compute Node whose monitor scope is enabled.
node=ComplexQuery(
SimpleQuery(portal_type='Remote Node'),
ComplexQuery(
SimpleQuery(portal_type='Compute Node'),
SimpleQuery(monitor_scope__uid=monitor_enabled_category.getUid()),
logical_operator='and'
),
logical_operator='or'
),
validation_state='validated',
follow_up__uid=project_uid,
method_id='ComputeNode_checkAllocationConsistencyState',
# This alarm bruteforce checking all documents,
# without changing them directly.
# Increase priority to not block other activities
activate_kw={'tag': tag, 'priority': 2}
)
# Activity on the project that waits for the activities tagged above.
project.activate(after_tag=tag).getId()
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="_reconstructor" module="copy_reg"/>
</klass>
<tuple>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
<global name="object" module="__builtin__"/>
<none/>
</tuple>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>tag</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>ComputeNode_checkProjectAllocationConsistencyState</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
portal = context.getPortalObject()
compute_partition = context
error_dict = {}
if known_error_dict is None:
known_error_dict = {}
compute_node = compute_partition.getParentValue()
assert compute_node.getPortalType() in ['Compute Node', 'Remote Node']
instance_list = compute_partition.getAggregateRelatedValueList(portal_type=[
'Software Instance', 'Slave Instance'])
for instance in instance_list:
if instance.getValidationState() != 'validated' or \
instance.getSlapState() == 'destroy_requested':
# Outdated catalog or instance under garbage collection,
# we skip for now.
continue
instance_software_release_url = instance.getUrlString()
instance_software_type = instance.getSourceReference()
is_known = False
for _e_dict in [error_dict, known_error_dict]:
if instance_software_release_url in _e_dict:
if instance_software_type in _e_dict[instance_software_release_url]:
# Skip calculate same thing again?
is_known = True
break
# Value was already processed or discovered on some other partition
# so we skip
if is_known:
continue
# Now check allocation supply consistency
instance_tree = instance.getSpecialiseValue(portal_type="Instance Tree")
# if there is an ongoing upgrade decision, skip, since there is already
# a ticket for handle the inconsistency.
if portal.portal_catalog.getResultValue(
portal_type='Upgrade Decision',
aggregate__uid=instance_tree.getUid(),
simulation_state=['started', 'stopped', 'planned', 'confirmed']) is not None:
continue
instance_tree_context = instance_tree.asContext(
default_source_reference=instance_software_type,
url_string=instance_software_release_url)
project = instance.getFollowUpValue()
assert project is not None, 'Project is None'
allocation_cell_list = []
software_product, software_release, software_type = instance_tree_context.InstanceTree_getSoftwareProduct()
if software_product is not None:
allocation_cell_list = project.Project_getSoftwareProductPredicateList(
software_product=software_product,
software_product_type=software_type,
software_product_release=software_release,
destination_value=instance_tree.getDestinationSectionValue(),
node_value=compute_node,
predicate_portal_type='Allocation Supply Cell'
)
if not allocation_cell_list:
# Sampling of the structure.
# error_dict[software_release_url][software_type] = (instance, compute_partition)
value = (instance, compute_partition)
if instance_software_release_url not in error_dict:
error_dict[instance_software_release_url] = {}
if instance_software_type not in error_dict[instance_software_release_url]:
error_dict[instance.getUrlString()][instance_software_type] = value
return error_dict
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<global name="PythonScript" module="Products.PythonScripts.PythonScript"/>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_bind_names</string> </key>
<value>
<object>
<klass>
<global name="_reconstructor" module="copy_reg"/>
</klass>
<tuple>
<global name="NameAssignments" module="Shared.DC.Scripts.Bindings"/>
<global name="object" module="__builtin__"/>
<none/>
</tuple>
<state>
<dictionary>
<item>
<key> <string>_asgns</string> </key>
<value>
<dictionary>
<item>
<key> <string>name_container</string> </key>
<value> <string>container</string> </value>
</item>
<item>
<key> <string>name_context</string> </key>
<value> <string>context</string> </value>
</item>
<item>
<key> <string>name_m_self</string> </key>
<value> <string>script</string> </value>
</item>
<item>
<key> <string>name_subpath</string> </key>
<value> <string>traverse_subpath</string> </value>
</item>
</dictionary>
</value>
</item>
</dictionary>
</state>
</object>
</value>
</item>
<item>
<key> <string>_params</string> </key>
<value> <string>known_error_dict=None</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>ComputePartition_checkAllocationConsistencyState</string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
...@@ -48,6 +48,68 @@ if context.getPortalType() == 'Slave Instance' and compute_node.getPortalType() ...@@ -48,6 +48,68 @@ if context.getPortalType() == 'Slave Instance' and compute_node.getPortalType()
error_dict['ticket_description'] = error_dict['message'] error_dict['ticket_description'] = error_dict['message']
return error_dict return error_dict
sla_dict = context.getSlaXmlAsDict()
instance_sla_error_list = []
instance_project_reference = context.getFollowUpReference()
project_reference = compute_node.getFollowUpReference()
if project_reference != instance_project_reference:
instance_sla_error_list.append("Instance and Compute node project do not match on: %s (%s != %s)" % (
context.getTitle(), project_reference, instance_project_reference))
if sla_dict:
instance_title = context.getTitle()
# Simple check of instance SLAs
if "computer_guid" in sla_dict:
computer_guid = sla_dict.pop("computer_guid")
if compute_node.getReference() != computer_guid:
instance_sla_error_list.append('computer_guid do not match on: %s (%s != %s)' % (
instance_title, computer_guid, compute_node.getReference()))
if "instance_guid" in sla_dict:
instance_guid = sla_dict.pop("instance_guid")
if context.getPortalType() != 'Slave Instance':
instance_sla_error_list.append('instance_guid is provided to a Software Instance: %s' % instance_title)
else:
if compute_node.getPortalType() == "Remote Node":
instance_sla_error_list.append('instance_guid provided on %s and it is allocated on a REMOTE NODE' % instance_title)
else:
software_instance = compute_partition.getAggregateRelatedValue(portal_type='Software Instance')
if software_instance is not None and software_instance.getReference() != instance_guid:
instance_sla_error_list.append('instance_guid do not match on: %s (%s != %s)' % (
instance_title, instance_guid, software_instance.getReference()))
if 'network_guid' in sla_dict:
network_guid = sla_dict.pop('network_guid')
network_reference = compute_node.getSubordinationReference()
if network_reference != network_guid:
instance_sla_error_list.append('network_guid do not match on: %s (%s != %s)' % (
instance_title, network_guid, network_reference))
if 'project_guid' in sla_dict:
project_guid = sla_dict.pop("project_guid")
if project_reference != project_guid:
instance_sla_error_list.append('project_guid do not match on: %s (%s != %s)' % (
instance_title, project_guid, project_reference))
if instance_sla_error_list:
# Slave instance is allocated but the software instance was already destroyed
error_dict['notification_message_reference'] = 'slapos-crm-instance-tree-has-invalid-sla.notification'
error_dict = updateErrorDictWithError(error_dict)
error_text = ""
for _e in instance_sla_error_list:
error_text += " * %s\n" % _e
error_dict['error_text'] = error_text
error_dict['ticket_description'] = """%s has invalid Service Level Aggrement.
Detected inconsistencies:
%s""" % (instance_title, error_text)
error_dict['message'] = error_dict['ticket_description']
return error_dict
# Skip to check if monitor disabled on the compute node. # Skip to check if monitor disabled on the compute node.
# Remote node has no state. # Remote node has no state.
if (compute_node.getPortalType() == "Compute Node") and (compute_node.getMonitorScope() != "enabled"): if (compute_node.getPortalType() == "Compute Node") and (compute_node.getMonitorScope() != "enabled"):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment