Task Distribution: give a chance for test suite to finish when testnodes are missing

6a0c7290 · Sebastien Robin · b96534a5 · 6a0c7290 · 6a0c7290
Commit 6a0c7290 authored May 31, 2017 by Sebastien Robin
2 changed files
--- a/bt5/erp5_test_result/TestTemplateItem/portal_components/test.erp5.testTaskDistribution.py
+++ b/bt5/erp5_test_result/TestTemplateItem/portal_components/test.erp5.testTaskDistribution.py
@@ -475,6 +475,12 @@ class TestTaskDistribution(ERP5TypeTestCase):
    checkTestResultLine([('testBar', 'started'), ('testFoo', 'stopped')])
  def test_07_reportTaskFailure(self):
+    """
+    When all test nodes report failures, we should mark the test result as
+    failed. If we do not do so, test nodes would always pick up the same
+    repository revision and might fail with the same failure forever (for
+    example, a slapos build issue).
+    """
    test_result_path, revision = self._createTestResult(node_title="Node0")
    next_test_result_path, revision = self._createTestResult(node_title="Node1")
    self.assertEqual(test_result_path, next_test_result_path)
@@ -493,6 +499,48 @@ class TestTaskDistribution(ERP5TypeTestCase):
    self.assertEqual("failed", test_result.getSimulationState())
    checkNodeState("failed", "failed")
+  def test_07b_reportTaskFailureWithRunningTest(self):
+    """
+    Similar to the above test. However, sometimes a failure is reported only because
+    runTestSuite reached its timeout. This happens when not enough testnodes are
+    working on a very long test suite. So the code checks whether tests appeared to
+    be working fine, and it does not mark the test result as failed if there is a
+    chance that the tests could be continued.
+    For example:
+    - testnode0 starts test suite Foo with revision r0, which would take 6 hours
+      (other testnodes are busy)
+    - after 4 hours, runTestSuite reaches its timeout of 4 hours (value set in test
+      nodes), thus it reports a failure. We do not mark the test result as failed
+      since everything went fine up to now
+    - after some time testnode0 comes back to run test suite Foo, revision r0, and
+      just does the 2 remaining hours. The test suite can run to the end even if
+      the timeout is smaller than the total time for the test suite.
+    """
+    now = DateTime()
+    try:
+      self.pinDateTime(now - 1.0/24*2)
+      test_result_path, revision = self._createTestResult(node_title="Node0",
+                                               test_list=['testFoo', 'testBar'])
+      test_result = self.getPortalObject().unrestrictedTraverse(test_result_path)
+      self.assertEqual("started", test_result.getSimulationState())
+      node, = test_result.objectValues(portal_type="Test Result Node",
+                                           sort_on=[("title", "ascending")])
+      self.assertEqual("started", node.getSimulationState())
+      line_url, test = self.tool.startUnitTest(test_result_path)
+      # We have a failure but with recent activities on tests
+      self.pinDateTime(now - 1.0/24*1.5)
+      self.tool.reportTaskFailure(test_result_path, {}, "Node0")
+      self.assertEqual("failed", node.getSimulationState())
+      self.assertEqual("started", test_result.getSimulationState())
+      # We have a failure but with no recent activities on tests
+      self.pinDateTime(now)
+      self.tool.reportTaskFailure(test_result_path, {}, "Node0")
+      self.assertEqual("failed", node.getSimulationState())
+      self.assertEqual("failed", test_result.getSimulationState())
+    finally:
+      self.unpinDateTime()
  def test_08_checkWeCanNotCreateTwoTestResultInParallel(self):
    """
    To avoid duplicates of test result when several testnodes works on the

--- a/product/ERP5/Tool/TaskDistributionTool.py
+++ b/product/ERP5/Tool/TaskDistributionTool.py
@@ -27,6 +27,7 @@
 ##############################################################################
 import random
+from DateTime import DateTime
 from AccessControl import ClassSecurityInfo
 from Products.ERP5Type import Permissions, PropertySheet, Constraint, interfaces
 from Products.ERP5Type.Tool.BaseTool import BaseTool
@@ -270,8 +271,17 @@ class TaskDistributionTool(BaseTool):
      if node.getSimulationState() != 'failed':
        break
    else:
-      if test_result.getSimulationState() not in ('failed', 'cancelled'):
+      # now check if we had recent work on test line, if so, this means
-        test_result.fail()
+      # we might just have had a timeout due to too many tests to execute for
+      # too few nodes. In that case we would like to continue the work later
+      recent_time = DateTime() - 1.0/24
+      for test_result_line in test_result.objectValues(
+          portal_type="Test Result Line"):
+        if test_result_line.getModificationDate() > recent_time:
+          break
+      else:
+        if test_result.getSimulationState() not in ('failed', 'cancelled'):
+          test_result.fail()
  security.declarePublic('reportTaskStatus')
  def reportTaskStatus(self, test_result_path, status_dict, node_title):