Commit 20dbeadb by Cédric de Saint Martin

Merge branch 'kvmresiliency'

parents b0ecfe66 62d518d8
......@@ -72,6 +72,26 @@ class EqueueServer(SocketServer.ThreadingUnixStreamServer):
def setDB(self, database):
    """Open the gdbm database that remembers the last run of each command.

    `database` is the path handed to gdbm.open. The 'cs' flags and the
    0o700 mode are passed through unchanged from the previous version.
    """
    # 0o700 is the same value as the legacy literal 0700, written in the
    # spelling that both Python 2.6+ and Python 3 parse.
    self.db = gdbm.open(database, 'cs', 0o700)
def _runCommandIfNeeded(self, command, timestamp):
    """Run `command` unless a run at least as recent is already recorded.

    self.db maps each command to the timestamp of its last run. The command
    is skipped when the stored timestamp is greater than or equal to
    `timestamp`. Otherwise it is executed, its combined stdout/stderr is
    logged, and `timestamp` is recorded whether the command succeeded or
    exited with a non-zero status.

    self.lock is held for the whole call, so commands run one at a time.
    """
    with self.lock:
        if command in self.db and timestamp <= int(self.db[command]):
            self.logger.info("%s already run.", command)
            return

        self.logger.info("Running %s, %s with output:", command, timestamp)
        try:
            output = subprocess.check_output(
                [command], stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError as e:
            # Hand the values to the logger instead of pre-formatting the
            # message, like every other logging call in this method.
            self.logger.warning(
                "%s exited with status %s. output is: \n %s",
                command, e.returncode, e.output)
        else:
            self.logger.info(output)
            self.logger.info("%s finished successfully.", command)

        # gdbm mappings hold strings only; the value is read back with
        # int() above, so store its string form explicitly.
        self.db[command] = str(timestamp)
def process_request_thread(self, request, client_address):
# Handle request
self.logger.debug("Connection with file descriptor %d", request.fileno())
......@@ -102,23 +122,7 @@ class EqueueServer(SocketServer.ThreadingUnixStreamServer):
except:
self.logger.warning("Couldn't respond to %r", request.fileno())
self.close_request(request)
# Run command if needed
with self.lock:
if command not in self.db or timestamp > int(self.db[command]):
self.logger.info("Running %s, %s", command, timestamp)
# XXX stdout and stderr redirected to null as they are not read
with open(os.devnull, 'r+') as f:
status = subprocess.call([command], close_fds=True,
stdin=f, stdout=f, stderr=f)
if status:
self.logger.warning("%s finished with non zero status.",
command)
else:
self.logger.info("%s finished successfully.", command)
self.db[command] = timestamp
else:
self.logger.info("%s already runned.", command)
self._runCommandIfNeeded(command, timestamp)
# Well the following function is made for schrodinger's files,
# It will work if the file exists or not
def remove_existing_file(path):
......
......@@ -15,3 +15,32 @@ This module contains:
* A Resiliency Test Suite framework (in suites/), used to easily write new
test suites
* A list of test suites
TODO :
* Check that each partition is in different slapos node.
* Test for bang calls
* Be able to configure from ERP5 Master (i.e from instance parameters): count of PBS/clones, then test several possibilities (so called "count" in test suite)
* Use Nexedi ERP5, when in production.
* Put the KVM disk image in a safe place.
------------
For reference: How-to deploy the whole test system
1/ Deploy a SlapOS Master
2/ Deploy an ERP5, install erp5_test_result BT with scalability feature (currently in the scalability-master2 branch of erp5.git) (currently, had to change a few lines in the scalability extension of the portal_class, should be committed)
3/ Configure 3 nodes in the new SlapOS Master, deploy in each a testnode with scalability feature (erp5testnode-scalability branch of slapos.git) with parameters like:
<?xml version="1.0" encoding="utf-8"?>
<instance>
<parameter id="test-node-title">COMP-0-Testnode</parameter>
<parameter id="test-suite-master-url">https://zope:insecure@softinst43496.host.vifib.net/erp5/portal_task_distribution/1</parameter>
</instance>
3bis/ Supply and request http://git.erp5.org/gitweb/slapos.git/blob_plain/refs/tags/slapos-0.92:/software/kvm/software.cfg on a public node (so that vnc frontends are ok). "domain" parameter should be [ipv6] of partition. ipv4:4443 should be 6tunnelled to ipv6:4443 (Note: here, instead, I just hacked kvm_frontend to listen on ipv6).
3ter/ Supply and request http://git.erp5.org/gitweb/slapos.git/blob_plain/HEAD:/software/apache-frontend/software.cfg, with any "domain" (it won't be used), on a public node (so that web frontends are ok)
4/ On the ERP5 instance, create a project, a Task Distribution (in portal_task_distribution, type Scalability Task Distribution)
5/ On the ERP5 instance, create a Test Suite, validate it
Note: the slapos nodes are currently deployed using slapos-in-partition.
Note: you have to manually kill -10 the erp5testnode process to start deployment of test because it doesn't know when SR installation is finished.
Note: you have to manually run slapos-node-software --all on the slapos nodes if you are developing the SR you are testing.
......@@ -37,10 +37,6 @@ import traceback
from erp5.util import taskdistribution
from erp5.util.testnode import Utils
MAX_INSTALLATION_TIME = 60 * 50
MAX_TESTING_TIME = 60
MAX_GETTING_CONNECTION_TIME = 60 * 5
def importFrom(name):
"""
Import a test suite module (in the suites module) and return it.
......@@ -147,7 +143,7 @@ class ScalabilityLauncher(object):
Return a ScalabilityTest with current running test case informations,
or None if no test_case ready
"""
data = self.test_result.getNextTestCase()
data = self.test_result.getRunningTestCase()
if data == None:
return None
decoded_data = Utils.deunicodeData(json.loads(
......
......@@ -42,6 +42,10 @@ import urllib
logger = logging.getLogger('KVMResiliencyTest')
# Wait for 2 hours before renaming, so that replication of data is done
# (~1GB of data to backup)
SLEEP_TIME = 2 * 60 * 60
def fetchMainInstanceIP(current_partition, software_release, instance_name):
return current_partition.request(
software_release=software_release,
......@@ -154,23 +158,30 @@ def runTestSuite(server_url, key_file, cert_file,
# Test each clone
while current_clone <= clone_count:
logger.info('Testing kvm%s.' % current_clone)
# Wait for XX minutes so that replication is done
sleep_time = 60 * 15#2 * 60 * 60
logger.info('Sleeping for %s seconds.' % sleep_time)
time.sleep(sleep_time)
logger.info('Sleeping for %s seconds.' % SLEEP_TIME)
time.sleep(SLEEP_TIME)
# Make the clone instance takeover the main instance
logger.info('Replacing main instance by clone instance...')
takeover(
server_url=server_url,
key_file=key_file,
cert_file=cert_file,
computer_guid=computer_id,
partition_id=partition_id,
software_release=software,
namebase=namebase,
winner_instance_suffix=str(current_clone),
)
for i in range(0, 10):
try:
takeover(
server_url=server_url,
key_file=key_file,
cert_file=cert_file,
computer_guid=computer_id,
partition_id=partition_id,
software_release=software,
namebase=namebase,
winner_instance_suffix=str(current_clone),
)
break
except: # SSLError
traceback.print_exc()
if i is 9:
raise
logger.warning('takeover failed. Retrying...')
time.sleep(10)
logger.info('Done.')
# Wait for the new IP (of old-clone new-main instance) to appear.
......
......@@ -42,6 +42,7 @@ class ResiliencyTestSuite(object):
computer_id, partition_id, software,
namebase,
root_instance_name,
sleep_time_between_test=600,
total_instance_count="3"):
self.server_url = server_url
self.key_file = key_file
......@@ -52,6 +53,7 @@ class ResiliencyTestSuite(object):
self.namebase = namebase
self.total_instance_count = total_instance_count
self.root_instance_name = root_instance_name
self.sleep_time_between_test = sleep_time_between_test
slap = slapos.slap.slap()
slap.initializeConnection(server_url, key_file, cert_file)
......@@ -151,12 +153,12 @@ class ResiliencyTestSuite(object):
# Test each clone
while current_clone <= clone_count:
# Wait for XX minutes so that replication is done
sleep_time = 60 * 15#2 * 60 * 60
self.logger.info('Sleeping for %s seconds before testing clone %s.' % (
sleep_time,
self.sleep_time_between_test,
current_clone
))
time.sleep(sleep_time)
time.sleep(self.sleep_time_between_test)
self._doTakeover(current_clone)
self.logger.info('Testing %s%s instance.' % (self.namebase, current_clone))
success = self.checkDataOnCloneInstance()
......
......@@ -50,22 +50,19 @@ class SlaprunnerTestSuite(ResiliencyTestSuite):
# Setup urllib2 with cookie support
cookie_jar = cookielib.CookieJar()
self._opener_director = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
self._opener_director = urllib2.build_opener(
urllib2.HTTPCookieProcessor(cookie_jar)
)
ResiliencyTestSuite.__init__(
self,
server_url, key_file, cert_file,
computer_id, partition_id, software,
namebase,
slaprunner_rootinstance_name
slaprunner_rootinstance_name,
300
)
def generateData(self):
    """Pick the slaprunner credentials for this run and log them.

    The user name is always 'slapos'. The password is made of eight
    distinct lowercase ASCII letters sampled with random.SystemRandom.
    """
    rng = random.SystemRandom()
    self.slaprunner_password = ''.join(rng.sample(string.ascii_lowercase, 8))
    self.slaprunner_user = 'slapos'
    log = self.logger.info
    log('Generated slaprunner user is: %s' % self.slaprunner_user)
    log('Generated slaprunner password is: %s' % self.slaprunner_password)
def _connectToSlaprunner(self, resource, data=None):
"""
Utility.
......@@ -84,23 +81,92 @@ class SlaprunnerTestSuite(ResiliencyTestSuite):
def _login(self):
    """Log in to the slaprunner with the credentials stored on the suite.

    Sends self.slaprunner_user and self.slaprunner_password to the
    'doLogin' resource through self._connectToSlaprunner, exactly once.
    """
    self.logger.debug('Logging in...')
    credentials = (self.slaprunner_user, self.slaprunner_password)
    self._connectToSlaprunner(
        'doLogin',
        data='clogin=%s&cpwd=%s' % credentials
    )
def _retrieveInstanceLogFile(self):
    """
    Fetch the logfile (=data) of the instance and return its content.

    The content is requested once from the 'fileBrowser' resource and is
    then validated. IOError is raised when the content looks like html
    (contains '<'), when the server answers that it could not load the
    file, or when the content does not contain "Hello".
    """
    data = self._connectToSlaprunner(
        resource='fileBrowser',
        data='opt=9&filename=log.log&dir=instance_root%252Fslappart0%252Fvar%252Flog%252F'
    )
    self.logger.info('Retrieved data are:\n%s' % data)

    # Use substring membership rather than comparing str.find() with
    # `is` / `is not -1`: identity of ints is an implementation detail,
    # equality is what is meant here.
    if '<' in data:
        raise IOError(
            'Could not retrieve logfile content: retrieved content is html.'
        )
    if 'Could not load' in data:
        raise IOError(
            'Could not retrieve logfile content: server could not load the file.'
        )
    if 'Hello' not in data:
        raise IOError(
            'Could not retrieve logfile content: retrieve content does not match "Hello".'
        )
    return data
def _waitForSoftwareBuild(self):
    """Block until the slaprunner stops reporting a running software build.

    Polls the 'slapgridResult' resource and sleeps 15 seconds between
    polls for as long as the answer contains '"software": true'.
    """
    # Substring membership replaces `find(...) is not -1`, which compared
    # ints by identity instead of by value.
    while '"software": true' in self._connectToSlaprunner(
            resource='slapgridResult', data='position=0&log='):
        self.logger.info('Software release is still building. Sleeping...')
        time.sleep(15)
    self.logger.info('Software Release has been built / is no longer building.')
def _buildSoftwareRelease(self):
    """Ask the slaprunner to build the software release, then wait for it.

    A failed 'runSoftwareProfile' request (NotHttpOkException or
    urllib2.HTTPError) is ignored on purpose; completion is detected by
    self._waitForSoftwareBuild() instead.
    """
    self.logger.info('Building the Software Release...')
    try:
        self._connectToSlaprunner('runSoftwareProfile')
    except (NotHttpOkException, urllib2.HTTPError):
        # The nginx frontend may time out before the software release is
        # finished; that is not an error for this test.
        pass
    self._waitForSoftwareBuild()
def _deployInstance(self):
    """Ask the slaprunner to deploy the instance and wait until it is done.

    A failed 'runInstanceProfile' request (NotHttpOkException or
    urllib2.HTTPError) is ignored on purpose. The method then polls
    'slapgridResult' and sleeps 15 seconds between polls for as long as
    the answer contains '"instance": true'.
    """
    self.logger.info('Deploying instance...')
    try:
        self._connectToSlaprunner(resource='runInstanceProfile')
    except (NotHttpOkException, urllib2.HTTPError):
        # The nginx frontend might time out before the request completes.
        pass

    # Substring membership replaces `find(...) is not -1`, which compared
    # ints by identity instead of by value.
    while '"instance": true' in self._connectToSlaprunner(
            resource='slapgridResult', data='position=0&log='):
        self.logger.info('Buildout is still running. Sleeping...')
        time.sleep(15)
    self.logger.info('Instance has been deployed.')
def _gitClone(self):
    """Clone slapos.git into workspace/slapos through the slaprunner.

    When the 'cloneRepository' request fails (NotHttpOkException or
    urllib2.HTTPError), the clone is assumed to be still in progress and
    'getProjectStatus' is polled until its answer contains
    'On branch master'.
    """
    self.logger.debug('Doing git clone of git.erp5.org/repos/slapos.git...')
    try:
        self._connectToSlaprunner(
            resource='cloneRepository',
            data='repo=http://git.erp5.org/repos/slapos.git&name=workspace/slapos&email=slapos@slapos.org&user=slapos'
        )
    except (NotHttpOkException, urllib2.HTTPError):
        # cloning can be very long.
        # XXX: quite dirty way to check.
        while 'On branch master' not in self._connectToSlaprunner(
                'getProjectStatus', data='project=workspace/slapos'):
            self.logger.info('git-cloning ongoing, sleeping...')
            # Actually pause between polls, as the log message says and as
            # the other polling loops of this class do; without it the
            # loop floods the server with status requests.
            time.sleep(15)
def _openSoftwareRelease(self, software_name):
    """Make workspace/slapos/software/<software_name>/ the current project.

    The path is sent to the 'setCurrentProject' resource of the slaprunner.
    """
    self.logger.debug('Opening %s software release...' % software_name)
    project_path = 'path=workspace/slapos/software/%s/' % software_name
    self._connectToSlaprunner(resource='setCurrentProject', data=project_path)
def generateData(self):
    """Generate and log the slaprunner credentials used by the test.

    Sets self.slaprunner_password to eight distinct lowercase ASCII
    letters drawn with random.SystemRandom, and self.slaprunner_user to
    'slapos'.
    """
    letters = random.SystemRandom().sample(string.ascii_lowercase, 8)
    self.slaprunner_password = ''.join(letters)
    self.slaprunner_user = 'slapos'

    user_message = 'Generated slaprunner user is: %s' % self.slaprunner_user
    self.logger.info(user_message)
    password_message = 'Generated slaprunner password is: %s' % self.slaprunner_password
    self.logger.info(password_message)
def pushDataOnMainInstance(self):
"""
Create a dummy Software Release,
......@@ -117,47 +183,26 @@ class SlaprunnerTestSuite(ResiliencyTestSuite):
slaprunner_recovery_code = parameter_dict['password_recovery_code']
self.logger.debug('Creating the slaprunner account...')
self._connectToSlaprunner(resource='configAccount', data='name=slapos&username=%s&email=slapos@slapos.org&password=%s&rcode=%s' % (self.slaprunner_user, self.slaprunner_password, slaprunner_recovery_code))
self._connectToSlaprunner(
resource='configAccount',
data='name=slapos&username=%s&email=slapos@slapos.org&password=%s&rcode=%s' % (
self.slaprunner_user,
self.slaprunner_password,
slaprunner_recovery_code
)
)
self._login()
self.logger.debug('Opening hello-world software release from git...')
try:
self._connectToSlaprunner(resource='cloneRepository', data='repo=http://git.erp5.org/repos/slapos.git&name=workspace/slapos&email=slapos@slapos.org&user=slapos')
except (NotHttpOkException, urllib2.HTTPError):
# cloning can be very long.
# XXX: quite dirty way to check.
while self._connectToSlaprunner('getProjectStatus', data='project=workspace/slapos').find('On branch master') is -1:
self.logger.info('git-cloning ongoing, sleeping...')
self._gitClone()
# XXX should be taken from parameter.
self._connectToSlaprunner(resource='setCurrentProject', data='path=workspace/slapos/software/helloworld/')
self._openSoftwareRelease('helloworld')
self.logger.info('Building the Software Release...')
try:
self._connectToSlaprunner(resource='runSoftwareProfile')
except (NotHttpOkException, urllib2.HTTPError):
# The nginx frontend might timeout before software release is finished.
pass
while self._connectToSlaprunner(resource='slapgridResult', data='position=0&log=').find('"software": true') is not -1:
self.logger.info('Buildout is still running. Sleeping...')
time.sleep(15)
self.logger.info('Software Release has been built.')
self.logger.info('Deploying instance...')
try:
self._connectToSlaprunner(resource='runInstanceProfile')
except (NotHttpOkException, urllib2.HTTPError):
# The nginx frontend might timeout before someftware release is finished.
pass
while self._connectToSlaprunner(resource='slapgridResult', data='position=0&log=').find('"instance": true') is not -1:
self.logger.info('Buildout is still running. Sleeping...')
time.sleep(15)
self.logger.info('Instance has been deployed.')
self._buildSoftwareRelease()
self._deployInstance()
self.data = self._retrieveInstanceLogFile()
def checkDataOnCloneInstance(self):
"""
Check that:
......@@ -173,6 +218,10 @@ class SlaprunnerTestSuite(ResiliencyTestSuite):
old_parameter_value=old_slaprunner_backend_url
)
self._login()
self._waitForSoftwareBuild()
# XXX: in theory, it should be done automatically by slaprunner.
# In practice, it is still too dangerous for ERP5 instances.
self._deployInstance()
new_data = self._retrieveInstanceLogFile()
if new_data == self.data:
......
......@@ -32,8 +32,14 @@ $(document).ready(function () {
$("#error").Popup(data.result, {type: 'alert', duration: 3000});
}
})
.error(function () {
$("#error").Popup("Cannot send your account identifier please try again!!",
.error(function (response) {
if (response && response.status === 401) {
$("#error").Popup('Login and/or password is incorrect.',
{type: 'alert', duration: 3000}
);
return
}
$("#error").Popup("Cannot send your account identifier",
{type: 'alert', duration: 3000});
})
.complete(function () {
......
......@@ -118,7 +118,7 @@ def doLogin():
# Authenticate and log in!
if g.users[username].authenticate(request.form['cpwd']):
return jsonify(code=1, result="")
return jsonify(code=0, result="Login or password is incorrect, please check it!")
return jsonify(code=0, result="Login or password is incorrect, please check it!"), 401
# software views
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment