Commit abcd3a95 authored by Jason Madden

Add --second-chance to the testrunner, and use it on CI.

Fixes #1719
parent 2a28796d
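
Summary of the change: instead of the CI-level shell trick of running the whole suite a second time on any failure (`python -m gevent.tests ... || python -m gevent.tests`), the testrunner now accepts `--second-chance`. A failing test command is retried exactly once, and the failed first attempt is kept only for the timing report rather than counted as a failure. The flag is mutually exclusive with `--failfast`. A minimal sketch of the intended flow, with `run_test` standing in for the runner's internal `util.run` call (names are illustrative, not the actual API):

    def run_with_second_chance(run_test, cmd, results, second_chance=True):
        # First attempt.
        result = run_test(cmd)
        if not result and second_chance:
            # Remember the failure for reporting only, then retry once.
            results <<= result
            result = run_test(cmd)
        # Whatever came back last is what actually counts.
        results += result
        return result
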
@@ -284,11 +284,8 @@ jobs:
        pip install -U pylint
        python -m pylint --limit-inference-results=1 --rcfile=.pylintrc gevent
    - name: "Tests: Basic"
-      # Run them twice if they fail; this is to workaround the inability to re-run
-      # single jobs. Ideally the testrunner should be updated to do this for just failing
-      # tests. See also make-manylinux.
      run: |
-        python -m gevent.tests $G_USE_COV || python -m gevent.tests
+        python -m gevent.tests --second-chance $G_USE_COV
    # For the CPython interpreters, unless we have reason to expect
    # different behaviour across the versions (e.g., as measured by coverage)
    # it's sufficient to run the full suite on the current version
@@ -303,21 +300,21 @@ jobs:
      env:
        GEVENT_FILE: thread
      run: |
-        python -m gevent.tests $G_USE_COV `(cd src/gevent/tests >/dev/null && ls test__*subprocess*.py)`
+        python -m gevent.tests --second-chance $G_USE_COV `(cd src/gevent/tests >/dev/null && ls test__*subprocess*.py)`
    - name: "Tests: c-ares resolver"
      # This sometimes fails on mac. Also, save mac minutes.
      if: (matrix.python-version == 2.7 || matrix.python-version == 3.9) && startsWith(runner.os, 'Linux')
      env:
        GEVENT_RESOLVER: ares
      run: |
-        python -mgevent.tests $G_USE_COV --ignore tests_that_dont_use_resolver.txt
+        python -mgevent.tests --second-chance $G_USE_COV --ignore tests_that_dont_use_resolver.txt
    - name: "Tests: dnspython resolver"
      # This has known issues on PyPy3. Also, save mac minutes.
      if: (matrix.python-version == 2.7 || matrix.python-version == 3.9) && startsWith(runner.os, 'Linux')
      env:
        GEVENT_RESOLVER: dnspython
      run: |
-        python -mgevent.tests $G_USE_COV --ignore tests_that_dont_use_resolver.txt
+        python -mgevent.tests --second-chance $G_USE_COV --ignore tests_that_dont_use_resolver.txt
    - name: "Tests: leakchecks"
      # Run the leaktests; this seems to be extremely slow on Python 3.7
      # XXX: Figure out why. Can we reproduce locally?
@@ -325,7 +322,7 @@ jobs:
      env:
        GEVENTTEST_LEAKCHECK: 1
      run: |
-        python -m gevent.tests --ignore tests_that_dont_do_leakchecks.txt
+        python -m gevent.tests --second-chance --ignore tests_that_dont_do_leakchecks.txt
    - name: "Tests: PURE_PYTHON"
      # No compiled cython modules on CPython, using the default backend. Get coverage here.
      # We should only need to run this for a single Python 2 and a Python 3
@@ -333,19 +330,19 @@ jobs:
      env:
        PURE_PYTHON: 1
      run: |
-        python -mgevent.tests --coverage || python -m gevent.tests
+        python -mgevent.tests --second-chance --coverage
    - name: "Tests: libuv"
      if: (matrix.python-version == 2.7 || matrix.python-version == 3.9)
      env:
        GEVENT_LOOP: libuv
      run: |
-        python -m gevent.tests $G_USE_COV || python -m gevent.tests
+        python -m gevent.tests --second-chance $G_USE_COV
    - name: "Tests: libev-cffi"
      if: (matrix.python-version == 2.7 || matrix.python-version == 3.9) && startsWith(runner.os, 'Linux')
      env:
        GEVENT_LOOP: libev-cffi
      run: |
-        python -m gevent.tests $G_USE_COV || python -m gevent.tests
+        python -m gevent.tests --second-chance $G_USE_COV
    - name: Report coverage
      if: ${{ !startsWith(matrix.python-version, 'pypy') }}
      run: |
@@ -460,7 +457,7 @@ jobs:
        # Verify that we got non-embedded builds
        python -c 'import gevent.libev.corecffi as CF; assert not CF.LIBEV_EMBED'
        python -c 'import gevent.libuv.loop as CF; assert not CF.libuv.LIBUV_EMBED'
-        python -mgevent.tests
+        python -mgevent.tests --second-chance
  manylinux_x86_64:
    runs-on: ubuntu-latest
...
@@ -187,7 +187,7 @@ test_script:
  - if not "%GWHEEL_ONLY%"=="true" %PYEXE% -c "import gevent.core; print(gevent.core.loop)"
  - if not "%GWHEEL_ONLY%"=="true" %PYEXE% -c "import gevent; print(gevent.config.settings['resolver'].get_options())"
  - if not "%GWHEEL_ONLY%"=="true" %PYEXE% -c "from gevent._compat import get_clock_info; print(get_clock_info('perf_counter'))"
-  - if not "%GWHEEL_ONLY%"=="true" %PYEXE% -mgevent.tests --config known_failures.py --quiet
+  - if not "%GWHEEL_ONLY%"=="true" %PYEXE% -mgevent.tests --second-chance --config known_failures.py --quiet
after_test:
  # pycparser can't be built correctly in an isolated environment.
...
@@ -134,9 +134,7 @@ if [ -d /gevent -a -d /opt/python ]; then
    fi
    if [ -z "$GEVENTTEST_SKIP_ALL" ]; then
-        # TODO: Make the testrunner automatically repeat flaky tests.
-        # See the github action.
-        python -mgevent.tests || python -m gevent.tests
+        python -mgevent.tests --second-chance
    else
        # Allow skipping the bulk of the tests. If we're emulating Arm,
        # running the whole thing takes forever.
...
@@ -5,6 +5,7 @@ import re
 import sys
 import os
 import glob
+import operator
 import traceback
 import importlib
@@ -80,8 +81,12 @@ class ResultCollector(object):
         self.passed = {}
         self.total_cases = 0
         self.total_skipped = 0
+        # Every RunResult reported: failed, passed, rerun
+        self._all_results = []
 
     def __iadd__(self, result):
+        self._all_results.append(result)
         if not result:
             self.failed[result.name] = result #[cmd, kwargs]
         else:
@@ -90,6 +95,26 @@ class ResultCollector(object):
         self.total_skipped += result.skipped_count
         return self
 
+    def __ilshift__(self, result):
+        """
+        collector <<= result
+
+        Stores the result, but does not count it towards
+        the number of cases run, skipped, passed or failed.
+        """
+        self._all_results.append(result)
+        return self
+
+    @property
+    def longest_running_tests(self):
+        """
+        A new list of RunResult objects, sorted from longest running
+        to shortest running.
+        """
+        return sorted(self._all_results,
+                      key=operator.attrgetter('run_duration'),
+                      reverse=True)
+
 
 class FailFast(Exception):
     pass
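
In other words, `+=` records a result and counts it, while the new `<<=` only remembers it for reporting; `longest_running_tests` then orders everything ever recorded by its `run_duration`. A minimal sketch of how the collector might be exercised (the fake result class is illustrative, and the import path is assumed, not shown in this diff):

    # Import path assumed; adjust to wherever the testrunner's ResultCollector lives.
    from gevent.testing.testrunner import ResultCollector

    class FakeResult(object):
        # Stand-in for util.RunResult, with only the attributes the
        # timing report needs.
        def __init__(self, name, run_duration):
            self.name = name
            self.run_duration = run_duration

    collector = ResultCollector()
    collector <<= FakeResult('test__socket', 42.3)    # stored, not counted
    collector <<= FakeResult('test__greenlet', 3.1)   # stored, not counted

    for result in collector.longest_running_tests:    # longest first
        print('%.1f seconds: %s' % (result.run_duration, result.name))
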
@@ -105,7 +130,8 @@ class Runner(object):
                  failfast=False,
                  quiet=False,
                  configured_run_alone_tests=(),
-                 worker_count=DEFAULT_NWORKERS):
+                 worker_count=DEFAULT_NWORKERS,
+                 second_chance=False):
         """
         :keyword quiet: Set to True or False to explicitly choose. Set to
            `None` to use the default, which may come from the environment variable
@@ -113,10 +139,13 @@
         """
         self._tests = tests
         self._configured_failing_tests = configured_failing_tests
-        self._failfast = failfast
         self._quiet = quiet
         self._configured_run_alone_tests = configured_run_alone_tests
 
+        assert not (failfast and second_chance)
+        self._failfast = failfast
+        self._second_chance = second_chance
+
         self.results = ResultCollector()
         self.results.total = len(self._tests)
         self._running_jobs = []
@@ -127,6 +156,10 @@
         if self._quiet is not None:
             kwargs['quiet'] = self._quiet
         result = util.run(cmd, **kwargs)
+        if not result and self._second_chance:
+            self.results <<= result
+            util.log("> %s", result.name, color='warning')
+            result = util.run(cmd, **kwargs)
         if not result and self._failfast:
             # Under Python 3.9 (maybe older versions?), raising the
             # SystemExit here (a background thread belonging to the
@@ -221,12 +254,10 @@
     def _report(self, elapsed_time, exit=False):
         results = self.results
         report(
-            results.total, results.failed, results.passed,
+            results,
             exit=exit,
             took=elapsed_time,
             configured_failing_tests=self._configured_failing_tests,
-            total_cases=results.total_cases,
-            total_skipped=results.total_skipped
         )
@@ -482,7 +513,15 @@ class Discovery(object):
                 module_name = os.path.splitext(filename)[0]
                 qualified_name = self.package + '.' + module_name if self.package else module_name
-                with open(os.path.abspath(filename), 'rb') as f:
+                # Also allow just 'foo' as a shortcut for 'gevent.tests.foo'
+                abs_filename = os.path.abspath(filename)
+                if (
+                        not os.path.exists(abs_filename)
+                        and not filename.endswith('.py')
+                        and os.path.exists(abs_filename + '.py')
+                ):
+                    abs_filename = abs_filename + '.py'
+                with open(abs_filename, 'rb') as f:
                     # Some of the test files (e.g., test__socket_dns) are
                     # UTF8 encoded. Depending on the environment, Python 3 may
                     # try to decode those as ASCII, which fails with UnicodeDecodeError.
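
The practical effect of the Discovery change is that a bare module name on the command line now resolves to its .py file, so `python -m gevent.tests test__socket` behaves like `python -m gevent.tests test__socket.py`. A standalone sketch of that resolution (function name illustrative):

    import os

    def resolve_test_filename(filename):
        # Mirror of the shortcut above: append '.py' when the bare name
        # does not exist but the corresponding .py file does.
        abs_filename = os.path.abspath(filename)
        if (not os.path.exists(abs_filename)
                and not filename.endswith('.py')
                and os.path.exists(abs_filename + '.py')):
            abs_filename += '.py'
        return abs_filename

    # e.g. in src/gevent/tests: resolve_test_filename('test__socket')
    # -> '/.../src/gevent/tests/test__socket.py'
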
@@ -583,18 +622,38 @@ def format_seconds(seconds):
     return seconds
 
-def report(total, failed, passed, exit=True, took=None,
-           configured_failing_tests=(),
-           total_cases=0, total_skipped=0):
+def _show_longest_running(result_collector, how_many=5):
+    longest_running_tests = result_collector.longest_running_tests
+    if not longest_running_tests:
+        return
+
+    # The only tricky part is handling repeats. we want to show them,
+    # but not count them as a distinct entry.
+    util.log('\nLongest-running tests:')
+    length_of_longest_formatted_decimal = len('%.1f' % longest_running_tests[0].run_duration)
+    frmt = '%' + str(length_of_longest_formatted_decimal) + '.1f seconds: %s'
+    seen_names = set()
+    for result in longest_running_tests:
+        util.log(frmt, result.run_duration, result.name)
+        seen_names.add(result.name)
+        if len(seen_names) >= how_many:
+            break
+
+
+def report(result_collector, # type: ResultCollector
+           exit=True, took=None,
+           configured_failing_tests=()):
     # pylint:disable=redefined-builtin,too-many-branches,too-many-locals
-    runtimelog = util.runtimelog # XXX: Global state!
-    if runtimelog:
-        util.log('\nLongest-running tests:')
-        runtimelog.sort()
-        length = len('%.1f' % -runtimelog[0][0])
-        frmt = '%' + str(length) + '.1f seconds: %s'
-        for delta, name in runtimelog[:5]:
-            util.log(frmt, -delta, name)
+    total = result_collector.total
+    failed = result_collector.failed
+    passed = result_collector.passed
+    total_cases = result_collector.total_cases
+    total_skipped = result_collector.total_skipped
+
+    _show_longest_running(result_collector)
 
     if took:
         took = ' in %s' % format_seconds(took)
     else:
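
Two details of `_show_longest_running` worth noting: the field width is derived from the longest duration so the column lines up, and reruns of the same test are shown but only counted once against `how_many`. A small illustration of the width logic (values made up):

    durations = [123.4, 61.0, 2.7]             # already sorted, longest first
    width = len('%.1f' % durations[0])         # '123.4' -> 5
    frmt = '%' + str(width) + '.1f seconds: %s'
    print(frmt % (durations[0], 'test__ssl'))  # "123.4 seconds: test__ssl"
    print(frmt % (durations[2], 'test__api'))  # "  2.7 seconds: test__api"
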
@@ -745,11 +804,11 @@ def main():
     parser.add_argument('--discover', action='store_true')
     parser.add_argument('--full', action='store_true')
     parser.add_argument('--config', default='known_failures.py')
-    parser.add_argument('--failfast', '-x', action='store_true')
     parser.add_argument("--coverage", action="store_true")
     parser.add_argument("--quiet", action="store_true", default=True)
     parser.add_argument("--verbose", action="store_false", dest='quiet')
     parser.add_argument("--debug", action="store_true", default=False)
     parser.add_argument("--package", default="gevent.tests")
     parser.add_argument(
         "--processes", "-j", default=DEFAULT_NWORKERS, type=int,
@@ -768,9 +827,17 @@
         'For example, "-u-network". GEVENTTEST_USE_RESOURCES is used '
         'if no argument is given. To only use one resources, specify '
         '"-unone,resource".')
 
     parser.add_argument("--travis-fold", metavar="MSG",
                         help="Emit Travis CI log fold markers around the output.")
+
+    fail_parser = parser.add_mutually_exclusive_group()
+    fail_parser.add_argument(
+        "--second-chance", action="store_true", default=False,
+        help="Give failed tests a second chance.")
+    fail_parser.add_argument(
+        '--failfast', '-x', action='store_true', default=False,
+        help="Stop running after the first failure.")
+
     parser.add_argument('tests', nargs='*')
     options = parser.parse_args()
 
     # options.use will be either None for not given, or a list
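
Because the two flags live in a mutually exclusive group, argparse itself rejects `--second-chance --failfast` before the Runner's assert is ever reached. A standalone sketch of that behaviour (parser construction is illustrative):

    import argparse

    parser = argparse.ArgumentParser(prog='gevent.tests')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--second-chance', action='store_true', default=False)
    group.add_argument('--failfast', '-x', action='store_true', default=False)

    print(parser.parse_args(['--second-chance']).second_chance)   # True
    # parser.parse_args(['--second-chance', '--failfast']) exits with:
    #   error: argument --failfast/-x: not allowed with argument --second-chance
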
@@ -862,6 +929,7 @@ def main():
         quiet=options.quiet,
         configured_run_alone_tests=RUN_ALONE,
         worker_count=options.processes,
+        second_chance=options.second_chance,
     )
 
     if options.travis_fold:
...
@@ -15,8 +15,6 @@ from gevent.monkey import get_original
 
 # pylint: disable=broad-except,attribute-defined-outside-init
 
-runtimelog = []
-MIN_RUNTIME = 1.0
 BUFFER_OUTPUT = False
 # This is set by the testrunner, defaulting to true (be quiet)
 # But if we're run standalone, default to false
@@ -272,7 +270,9 @@ class RunResult(object):
                  output=None, # type: str
                  error=None, # type: str
                  name=None,
-                 run_count=0, skipped_count=0):
+                 run_count=0, skipped_count=0,
+                 run_duration=0, # type: float
+                 ):
         self.command = command
         self.run_kwargs = run_kwargs
         self.code = code
@@ -281,6 +281,7 @@
         self.name = name
         self.run_count = run_count
         self.skipped_count = skipped_count
+        self.run_duration = run_duration
 
     @property
     def output_lines(self):
@@ -383,7 +384,7 @@ def run(command, **kwargs): # pylint:disable=too-many-locals
     try:
         time_start = perf_counter()
         out, err = popen.communicate()
-        took = perf_counter() - time_start
+        duration = perf_counter() - time_start
         if popen.was_killed or popen.poll() is None:
             result = 'TIMEOUT'
         else:
@@ -402,19 +403,18 @@
         out = out.rstrip()
         out += '\n'
     log('| %s\n%s', name, out)
-    status, run_count, skipped_count = _find_test_status(took, out)
+    status, run_count, skipped_count = _find_test_status(duration, out)
     if result:
         log('! %s [code %s] %s', name, result, status, color='error')
     elif not nested:
         log('- %s %s', name, status)
-    if took >= MIN_RUNTIME:
-        runtimelog.append((-took, name))
+
     return RunResult(
         command, kwargs, result,
         output=out, error=err,
         name=name,
         run_count=run_count,
-        skipped_count=skipped_count
+        skipped_count=skipped_count,
+        run_duration=duration,
     )
...
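
With the module-global `runtimelog`/`MIN_RUNTIME` pair gone, every `run()` call now carries its own wall-clock time on the returned `RunResult`, and all results (not just those over one second) feed the longest-running report. A minimal sketch of the pattern (command and name are made up; the RunResult import path is assumed, not shown in this diff):

    from time import perf_counter

    # Import path assumed; adjust to wherever RunResult is defined.
    from gevent.testing.util import RunResult

    time_start = perf_counter()
    sum(range(10**6))                      # stand-in for popen.communicate()
    duration = perf_counter() - time_start

    result = RunResult(['python', '-m', 'gevent.tests.test__example'], {}, 0,
                       name='test__example', run_duration=duration)
    print('%.1f seconds: %s' % (result.run_duration, result.name))
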