Commit 143c4978 authored by Rafael Monnerat

agent: Make test agent more tolerant to network failures.

parent b5999ac3
...@@ -11,6 +11,8 @@ import tempfile ...@@ -11,6 +11,8 @@ import tempfile
import slapos.slap import slapos.slap
from slapos.slap.slap import ConnectionError
from slapos.grid.utils import setRunning, setFinished from slapos.grid.utils import setRunning, setFinished
from erp5.util.taskdistribution import TaskDistributor, TaskDistributionTool from erp5.util.taskdistribution import TaskDistributor, TaskDistributionTool
...@@ -190,7 +192,7 @@ def main(): ...@@ -190,7 +192,7 @@ def main():
while True: while True:
section_dict = loadConfiguration(configuration, logger) section_dict = loadConfiguration(configuration, logger)
agent_parameter_dict = dict(configuration.items('agent')) agent_parameter_dict = dict(configuration.items('agent'))
task_distributor = TaskDistributor(agent_parameter_dict['report_url']) task_distributor = TaskDistributor(agent_parameter_dict['report_url'])
...@@ -279,8 +281,22 @@ def main(): ...@@ -279,8 +281,22 @@ def main():
assert master_url.startswith('https:') assert master_url.startswith('https:')
slap = slapos.slap.slap() slap = slapos.slap.slap()
slap.initializeConnection( retry = 0
master_url, key_file, cert_file) while True:
if retry > 100:
break
# wait until _hateoas_navigator is loaded.
slap.initializeConnection(
master_url, key_file, cert_file, timeout=120)
if getattr(slap, '_hateoas_navigator', None) is None:
logger.info("Fail to load _hateoas_navigator waiting a bit and retry.")
time.sleep(30)
else:
break
if getattr(slap, '_hateoas_navigator', None) is None:
raise ValueError("Fail to load _hateoas_navigator")
supply = slap.registerSupply() supply = slap.registerSupply()
order = slap.registerOpenOrder() order = slap.registerOpenOrder()
...@@ -357,6 +373,10 @@ def main(): ...@@ -357,6 +373,10 @@ def main():
logger.info('Checking %s: %r...', section, tester) logger.info('Checking %s: %r...', section, tester)
try: try:
deadline = tester.tic(now) deadline = tester.tic(now)
except ConnectionError:
logger.exception('Test execution ConnectionError for %s' % (section))
deadline = next_deadline
except Exception: except Exception:
logger.exception('Test execution fail for %s' % (section)) logger.exception('Test execution fail for %s' % (section))
test_line.stop(test_count=1, error_count=1, failure_count=0, test_line.stop(test_count=1, error_count=1, failure_count=0,
......
...@@ -9,6 +9,10 @@ from uritemplate import expand ...@@ -9,6 +9,10 @@ from uritemplate import expand
import slapos.slap import slapos.slap
from slapos.slap import SoftwareProductCollection from slapos.slap import SoftwareProductCollection
from slapos.slap.slap import ConnectionError
from requests.exceptions import HTTPError
from erp5.util.taskdistribution import SAFE_RPC_EXCEPTION_LIST from erp5.util.taskdistribution import SAFE_RPC_EXCEPTION_LIST
SOFTWARE_PRODUCT_NAMESPACE = "product." SOFTWARE_PRODUCT_NAMESPACE = "product."
...@@ -36,6 +40,8 @@ TESTER_STATE_INSTANCE_UNINSTALLED = "TESTER_STATE_INSTANCE_UNINSTALLED" ...@@ -36,6 +40,8 @@ TESTER_STATE_INSTANCE_UNINSTALLED = "TESTER_STATE_INSTANCE_UNINSTALLED"
class TestTimeout(Exception): class TestTimeout(Exception):
pass pass
# Simple decorator to prevent raise due small # Simple decorator to prevent raise due small
# network failures. # network failures.
def retryOnNetworkFailure(func): def retryOnNetworkFailure(func):
...@@ -46,7 +52,11 @@ def retryOnNetworkFailure(func): ...@@ -46,7 +52,11 @@ def retryOnNetworkFailure(func):
return func(*args, **kwargs) return func(*args, **kwargs)
except SAFE_RPC_EXCEPTION_LIST, e: except SAFE_RPC_EXCEPTION_LIST, e:
print 'Network failure: %s , %s' % (sys.exc_info(), e) print 'Network failure: %s , %s' % (sys.exc_info(), e)
except slapos.slap.slap.ConnectionError, e: except HTTPError, e:
print 'Network failure: %s , %s' % (sys.exc_info(), e)
except ConnectionError, e:
print 'Network failure: %s , %s' % (sys.exc_info(), e)
except slapos.slap.ConnectionError, e:
print 'Network failure: %s , %s' % (sys.exc_info(), e) print 'Network failure: %s , %s' % (sys.exc_info(), e)
print 'Retry method %s in %i seconds' % (func, retry_time) print 'Retry method %s in %i seconds' % (func, retry_time)
...@@ -101,7 +111,7 @@ class SlapOSMasterCommunicator(object): ...@@ -101,7 +111,7 @@ class SlapOSMasterCommunicator(object):
state=state, state=state,
**self.request_kw) **self.request_kw)
@retryOnNetworkFailure
def _hateoas_getComputer(self, reference): def _hateoas_getComputer(self, reference):
root_document = self.hateoas_navigator.getRootDocument() root_document = self.hateoas_navigator.getRootDocument()
...@@ -126,6 +136,7 @@ class SlapOSMasterCommunicator(object): ...@@ -126,6 +136,7 @@ class SlapOSMasterCommunicator(object):
return json.loads(self.hateoas_navigator.GET(getter_url)) return json.loads(self.hateoas_navigator.GET(getter_url))
@retryOnNetworkFailure
def getSoftwareInstallationList(self): def getSoftwareInstallationList(self):
# XXX Move me to slap.py API # XXX Move me to slap.py API
...@@ -143,6 +154,7 @@ class SlapOSMasterCommunicator(object): ...@@ -143,6 +154,7 @@ class SlapOSMasterCommunicator(object):
return json.loads(result)['_links']['content'] return json.loads(result)['_links']['content']
@retryOnNetworkFailure
def getSoftwareInstallationNews(self): def getSoftwareInstallationNews(self):
for si in self.getSoftwareInstallationList(): for si in self.getSoftwareInstallationList():
if si["title"] == self.url: if si["title"] == self.url:
...@@ -164,10 +176,12 @@ class SlapOSMasterCommunicator(object): ...@@ -164,10 +176,12 @@ class SlapOSMasterCommunicator(object):
return json.loads(result)['news'][0]["text"] return json.loads(result)['news'][0]["text"]
return "" return ""
@retryOnNetworkFailure
def getInstanceUrlList(self): def getInstanceUrlList(self):
if self.hosting_subscription_url is None: if self.hosting_subscription_url is None:
for hs in self.hateoas_navigator._hateoas_getHostingSubscriptionDict(): hosting_subscription_dict = self.hateoas_navigator._hateoas_getHostingSubscriptionDict()
for hs in hosting_subscription_dict:
if hs['title'] == self.name: if hs['title'] == self.name:
self.hosting_subscription_url = hs['href'] self.hosting_subscription_url = hs['href']
break break
...@@ -176,8 +190,9 @@ class SlapOSMasterCommunicator(object): ...@@ -176,8 +190,9 @@ class SlapOSMasterCommunicator(object):
return None return None
return self.hateoas_navigator.getHateoasInstanceList( return self.hateoas_navigator.getHateoasInstanceList(
self.hosting_subscription_url) self.hosting_subscription_url)
@retryOnNetworkFailure
def getNewsFromInstance(self, url): def getNewsFromInstance(self, url):
result = self.hateoas_navigator.GET(url) result = self.hateoas_navigator.GET(url)
...@@ -191,11 +206,13 @@ class SlapOSMasterCommunicator(object): ...@@ -191,11 +206,13 @@ class SlapOSMasterCommunicator(object):
result = self.hateoas_navigator.GET(object_link) result = self.hateoas_navigator.GET(object_link)
return json.loads(result)['news'] return json.loads(result)['news']
@retryOnNetworkFailure
def getInformationFromInstance(self, url): def getInformationFromInstance(self, url):
result = self.hateoas_navigator.GET(url) result = self.hateoas_navigator.GET(url)
result = json.loads(result) result = json.loads(result)
if result['_links'].get('action_object_slap', None) is None: if result['_links'].get('action_object_slap', None) is None:
print result['links']
return None return None
object_link = self.hateoas_navigator.hateoasGetLinkFromLinks( object_link = self.hateoas_navigator.hateoasGetLinkFromLinks(
...@@ -339,6 +356,7 @@ class SoftwareReleaseTester(SlapOSMasterCommunicator): ...@@ -339,6 +356,7 @@ class SoftwareReleaseTester(SlapOSMasterCommunicator):
return SOFTWARE_STATE_UNKNOWN return SOFTWARE_STATE_UNKNOWN
@retryOnNetworkFailure
def getRSSEntryFromMonitoring(self, base_url): def getRSSEntryFromMonitoring(self, base_url):
if base_url is None: if base_url is None:
return {} return {}
...@@ -353,6 +371,7 @@ class SoftwareReleaseTester(SlapOSMasterCommunicator): ...@@ -353,6 +371,7 @@ class SoftwareReleaseTester(SlapOSMasterCommunicator):
return {} return {}
@retryOnNetworkFailure
def _getInstanceState(self): def _getInstanceState(self):
latest_state = self.latest_state latest_state = self.latest_state
self._logger.debug('latest_state = %r', latest_state) self._logger.debug('latest_state = %r', latest_state)
...@@ -446,6 +465,7 @@ class SoftwareReleaseTester(SlapOSMasterCommunicator): ...@@ -446,6 +465,7 @@ class SoftwareReleaseTester(SlapOSMasterCommunicator):
if stopped: if stopped:
return INSTANCE_STATE_STOPPED return INSTANCE_STATE_STOPPED
@retryOnNetworkFailure
def teardown(self): def teardown(self):
""" """
Interrupt a running test sequence, putting it in idle state. Interrupt a running test sequence, putting it in idle state.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment