Commit f7eadb26 by Alain Takoudjou

grid.promise: implement a new promise launcher in slapgrid

Python promises can define sense, test and anomaly methods.
The test method is called by slapgrid, or whenever no bang is needed for the promise. The anomaly method is
called when a promise failure requires banging the master. The anomaly method is optional, but the test method is mandatory.

Slapgrid always runs promises and saves the result in .slapgrid/promise/result in JSON format. The result will be used later by the monitor.
When a partition is correctly deployed, slapgrid will only run the promise anomaly check, and will bang if there is an error and the failed promise is allowed to bang.

Check promise anomaly when the partition is up to date
1 parent 65c7a06d
# coding: utf-8
from zope.interface import Interface
class IPromise(Interface):
  """Base Promise interface."""

  # NOTE(review): zope.interface method declarations usually omit `self`
  # (as __init__ does below); the explicit `self` on the other methods is
  # kept untouched here -- confirm whether this interface is ever checked
  # with zope.interface.verify before changing the signatures.

  def __init__(config):
    """
      Build the promise.

      @param config: Configurations needed to start the promise
    """

  def anomaly(self):
    """
      Called to detect if there is an anomaly.

      @return AnomalyResult object
    """

  def sense(self):
    """
      Run the promise code and store the result.

      Failures are reported from here: raise an error, log an error
      message, ...
    """

  def test(self):
    """
      Test the promise and say whether a problem is detected or not.

      @return TestResult object
    """
# -*- coding: utf-8 -*-
# vim: set et sts=2:
##############################################################################
#
# Copyright (c) 2018 Vifib SARL and Contributors.
# All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 3
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
import subprocess
import functools
import signal
import traceback
from zope import interface as zope_interface
from slapos.grid.promise import interface
from slapos.grid.promise.generic import GenericPromise
class WrapPromise(GenericPromise):
  """
    A wrapper promise used to run old-style promises and bash promises.

    The wrapped executable is the one returned by getPromiseFile(); it is
    run from the partition folder and its combined stdout/stderr output is
    logged as the promise result.
  """

  zope_interface.implements(interface.IPromise)

  def __init__(self, config):
    """
      @param config: Configurations needed to start the promise
    """
    GenericPromise.__init__(self, config)
    self.setPeriodicity(minute=2)

  @staticmethod
  def terminate(name, logger, process, signum, frame):
    """
      Signal handler forwarding SIGINT/SIGTERM to the wrapped process.

      @param name: promise name, only used in the log message
      @param logger: logger receiving the termination/error messages
      @param process: subprocess.Popen object of the wrapped executable
      @param signum: number of the received signal
      @param frame: current stack frame (unused)
    """
    if signum not in [signal.SIGINT, signal.SIGTERM] or not process:
      return
    if process.poll() is not None:
      # The child already exited: signalling its pid again could hit an
      # unrelated process which reuses the same pid.
      return
    logger.info("Terminating promise process %r" % name)
    try:
      # make sure we kill the process on timeout
      process.terminate()
    except Exception:
      logger.error(traceback.format_exc())

  def sense(self):
    """
      Run the wrapped promise executable and log its output.

      The output is logged as an error when the executable exits with a
      non-zero return code, and as an info message otherwise.
    """
    promise_process = subprocess.Popen(
        [self.getPromiseFile()],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        cwd=self.getPartitionFolder()
    )
    handler = functools.partial(self.terminate, self.getName(), self.logger,
                                promise_process)
    previous_handlers = {}
    try:
      for signum in (signal.SIGINT, signal.SIGTERM):
        previous_handlers[signum] = signal.signal(signum, handler)
      # stderr is redirected to stdout, so only the first item carries data
      output = promise_process.communicate()[0]
    finally:
      # Do not leave a handler bound to a finished process installed
      for signum, previous in previous_handlers.items():
        # signal.signal() returns None when the previous handler was not
        # installed from Python; such a handler cannot be re-installed.
        if previous is not None:
          signal.signal(signum, previous)

    message = (output or "").strip()
    if promise_process.returncode != 0:
      self.logger.error(message)
    else:
      self.logger.info(message)

  def test(self):
    """
      @return result of self._test() computed over the latest result only
    """
    # Fail if the latest promise result failed
    return self._test(result_count=1, failure_amount=1)

  def anomaly(self):
    """
      @return result of self._test() computed over the 3 latest results
    """
    # Fail if 3 latest promise result failed, no bang
    return self._test(result_count=3, failure_amount=3)
......@@ -60,8 +60,9 @@ from slapos.grid.svcbackend import (launchSupervisord,
createSupervisordConfiguration,
_getSupervisordConfigurationDirectory,
_getSupervisordSocketPath)
from slapos.grid.utils import (md5digest, dropPrivileges, SlapPopen, updateFile,
checkPromiseList, PromiseError)
from slapos.grid.utils import (md5digest, dropPrivileges, SlapPopen, updateFile)
from slapos.grid.promise import PromiseLauncher, PromiseError
from slapos.grid.promise.generic import PROMISE_LOG_FOLDER_NAME
from slapos.human import human2bytes
import slapos.slap
from netaddr import valid_ipv4, valid_ipv6
......@@ -617,22 +618,39 @@ stderr_logfile_backups=1
return SLAPGRID_FAIL
return SLAPGRID_SUCCESS
def _checkPromiseList(self, computer_partition):
self.logger.info("Checking promises...")
instance_path = os.path.join(self.instance_root, computer_partition.getId())
def _checkPromiseList(self, partition, force=True, check_anomaly=False):
instance_path = os.path.join(self.instance_root, partition.partition_id)
promise_log_path = os.path.join(instance_path, PROMISE_LOG_FOLDER_NAME)
mkdir_p(promise_log_path)
self.logger.info("Checking %s promises..." % partition.partition_id)
uid, gid = None, None
stat_info = os.stat(instance_path)
#stat sys call to get statistics informations
uid = stat_info.st_uid
gid = stat_info.st_gid
promise_dir = os.path.join(instance_path, 'etc', 'promise')
if not checkPromiseList(promise_dir, self.promise_timeout, uid=uid, gid=gid,
cwd=instance_path, logger=self.logger, profile=False,
raise_on_failure=True):
self.logger.info("No promise.")
promise_dir = os.path.join(instance_path, 'etc', 'plugin')
legacy_promise_dir = os.path.join(instance_path, 'etc', 'promise')
promise_config = {
'promise-folder': promise_dir,
'legacy-promise-folder': legacy_promise_dir,
'promise-timeout': self.promise_timeout,
'uid': uid,
'gid': gid,
'partition-folder': instance_path,
'log-folder': promise_log_path,
'force': force,
'check-anomaly': check_anomaly,
'master-url': partition.server_url,
'partition-cert': partition.cert_file,
'partition-key': partition.key_file,
'partition-id': partition.partition_id,
'computer-id': self.computer_id,
}
promise_checker = PromiseLauncher(config=promise_config, logger=self.logger)
return promise_checker.run()
def _endInstallationTransaction(self, computer_partition):
partition_id = computer_partition.getId()
......@@ -777,7 +795,7 @@ stderr_logfile_backups=1
command, ip))
return cmd_list
def _getFirewallRejectRules(self, ip, hosting_ip_list, source_ip_list, ip_type='ipv4'):
"""
Generate rules for firewall based on list of IP that should not have access to `ip`
......@@ -946,6 +964,7 @@ stderr_logfile_backups=1
# Try to process it anyway, it may need to be deleted.
software_path = None
computer_partition_state = computer_partition.getState()
periodicity = self.maximum_periodicity
if software_path:
periodicity_path = os.path.join(software_path, 'periodicity')
......@@ -996,7 +1015,14 @@ stderr_logfile_backups=1
# Check periodicity, i.e if periodicity is one day, partition
# should be processed at least every day.
if int(time.time()) <= (last_runtime + periodicity) or periodicity < 0:
self.logger.debug('Partition already up-to-date, skipping.')
# check promises anomaly
if computer_partition_state == COMPUTER_PARTITION_STARTED_STATE:
self.logger.debug('Partition already up-to-date.')
self._checkPromiseList(local_partition,
check_anomaly=True,
force=False)
else:
self.logger.debug('Partition already up-to-date. skipping.')
return
else:
# Periodicity forced processing this partition. Removing
......@@ -1028,8 +1054,6 @@ stderr_logfile_backups=1
self.logger.info(' Software path: %s' % software_path)
self.logger.info(' Instance path: %s' % instance_path)
computer_partition_state = computer_partition.getState()
# XXX this line breaks 37 tests
# self.logger.info(' Instance type: %s' % computer_partition.getType())
self.logger.info(' Instance status: %s' % computer_partition_state)
......@@ -1046,7 +1070,7 @@ stderr_logfile_backups=1
if self.firewall_conf:
self._setupComputerPartitionFirewall(computer_partition,
partition_ip_list)
self._checkPromiseList(computer_partition)
self._checkPromiseList(local_partition)
computer_partition.started()
self._endInstallationTransaction(computer_partition)
elif computer_partition_state == COMPUTER_PARTITION_STOPPED_STATE:
......@@ -1085,11 +1109,18 @@ stderr_logfile_backups=1
with open(error_output_file, 'w') as error_file:
# Write error message in a log file accessible to computer partition user
error_file.write(str(e))
raise
if not isinstance(e, PromiseError) and \
computer_partition_state == COMPUTER_PARTITION_STARTED_STATE:
try:
self._checkPromiseList(local_partition)
except PromiseError:
# updating promises state, no need to raise here
pass
raise e
else:
self.logger.removeHandler(partition_file_handler)
if os.path.exists(error_output_file):
os.unlink(error_output_file)
os.unlink(error_output_file)
# If partition has been successfully processed, write timestamp
if timestamp:
......
......@@ -192,7 +192,7 @@ def dropPrivileges(uid, gid, logger):
if uid == 0 or gid == 0:
raise OSError('Dropping privileges to uid = %r or '
'gid = %r is too dangerous' % (uid, gid))
if current_uid or current_gid:
if (current_uid or current_gid):
logger.debug('Running as uid = %r, gid = %r, dropping '
'not needed and not possible' % (current_uid, current_gid))
return
......
......@@ -912,10 +912,12 @@ class TestSlapgridCPWithMasterWatchdog(MasterMixin, unittest.TestCase):
partition.software.setBuildout(BUILDOUT_RUN_CONTENT)
self.assertEqual(self.grid.processComputerPartitionList(), slapgrid.SLAPGRID_SUCCESS)
time.sleep(1)
self.assertInstanceDirectoryListEqual(['0'])
self.assertItemsEqual(os.listdir(partition.partition_path),
['.slapgrid', '.0_daemon.log', 'buildout.cfg',
'etc', 'software_release', 'worked', '.slapos-retention-lock-delay'])
'etc', 'software_release', 'worked', '.slapos-retention-lock-delay',
'launched', 'crashed'])
daemon_log = os.path.join(partition.partition_path, '.0_daemon.log')
self.assertLogContent(daemon_log, 'Failing')
self.assertIsNotCreated(self.watchdog_banged)
......@@ -1867,14 +1869,16 @@ class TestSlapgridCPWithMasterPromise(MasterMixin, unittest.TestCase):
f.write(textwrap.dedent("""\
#!/usr/bin/env sh
touch "%s"
echo Error 1>&2
echo 'Error Promise 254554802' 1>&2
exit 127""" % worked_file))
os.chmod(succeed, 0o777)
self.assertEqual(self.grid.processComputerPartitionList(),
slapos.grid.slapgrid.SLAPGRID_PROMISE_FAIL)
self.assertTrue(os.path.isfile(worked_file))
self.assertEqual(instance.error_log[-5:], 'Error')
log_file = '%s/.slapgrid/log/instance.log' % instance.partition_path
with open(log_file) as f:
self.assertTrue('Error Promise 254554802' in f.read())
self.assertTrue(instance.error)
self.assertIsNone(instance.state)
......@@ -1986,6 +1990,58 @@ class TestSlapgridCPWithMasterPromise(MasterMixin, unittest.TestCase):
self.assertEquals(instance.error, 1)
self.assertNotEqual(instance.state, 'started')
def test_promise_run_if_partition_started_fail(self):
  """
    Promises of a partition requested as 'started' are still run when
    the partition processing (buildout) fails.
  """
  computer = ComputerForTest(self.software_root, self.instance_root)
  with httmock.HTTMock(computer.request_handler):
    instance = computer.instance_list[0]
    instance.requested_state = 'started'
    # Buildout script exiting with an error status
    instance.software.setBuildout("""#!/bin/sh
exit 1
""")
    # First pass: no promise is installed yet, processing just fails
    self.assertEqual(self.grid.processComputerPartitionList(),
    slapos.grid.slapgrid.SLAPGRID_FAIL)
    self.assertInstanceDirectoryListEqual(['0'])
    self.assertItemsEqual(os.listdir(instance.partition_path),
    ['.slapgrid', 'buildout.cfg', 'software_release',
    '.slapgrid-0-error.log'])
    # Promise leaving a marker file behind so its execution can be detected
    promise_file = os.path.join(instance.partition_path, 'promise_ran')
    promise = textwrap.dedent("""\
#!/usr/bin/env sh
touch "%s"
exit 127""" % promise_file)
    instance.setPromise('promise_script', promise)
    # Second pass: processing still fails, but the promise must have run
    self.assertEqual(self.grid.processComputerPartitionList(),
    slapos.grid.slapgrid.SLAPGRID_FAIL)
    self.assertTrue(os.path.isfile(promise_file))
    self.assertTrue(instance.error)
def test_promise_notrun_if_partition_stopped_fail(self):
  """
    Promises of a partition requested as 'stopped' are not run when
    the partition processing (buildout) fails.
  """
  computer = ComputerForTest(self.software_root, self.instance_root)
  with httmock.HTTMock(computer.request_handler):
    instance = computer.instance_list[0]
    instance.requested_state = 'stopped'
    # Buildout script exiting with an error status
    instance.software.setBuildout("""#!/bin/sh
exit 1
""")
    # First pass: no promise is installed yet, processing just fails
    self.assertEqual(self.grid.processComputerPartitionList(),
    slapos.grid.slapgrid.SLAPGRID_FAIL)
    self.assertInstanceDirectoryListEqual(['0'])
    self.assertItemsEqual(os.listdir(instance.partition_path),
    ['.slapgrid', 'buildout.cfg', 'software_release',
    '.slapgrid-0-error.log'])
    # Promise leaving a marker file behind so its execution can be detected
    promise_file = os.path.join(instance.partition_path, 'promise_ran')
    promise = textwrap.dedent("""\
#!/usr/bin/env sh
touch "%s"
exit 127""" % promise_file)
    instance.setPromise('promise_script', promise)
    # Second pass: processing fails again and the marker file must be absent
    self.assertEqual(self.grid.processComputerPartitionList(),
    slapos.grid.slapgrid.SLAPGRID_FAIL)
    self.assertFalse(os.path.exists(promise_file))
    self.assertTrue(instance.error)
class TestSlapgridDestructionLock(MasterMixin, unittest.TestCase):
def test_retention_lock(self):
"""
......
Styling with Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!