Commit 0ec618af authored by Victor Stinner's avatar Victor Stinner Committed by GitHub

bpo-37531: regrtest ignores output on timeout (GH-16659)

bpo-37531, bpo-38207: On timeout, regrtest no longer attempts to call
`popen.communicate() again: it can hang until all child processes
using stdout and stderr pipes completes. Kill the worker process and
ignores its output.

Reenable test_regrtest.test_multiprocessing_timeout().

bpo-37531: Change also the faulthandler timeout of the main process
from 1 minute to 5 minutes, for Python slowest buildbots.
parent e53c5800
...@@ -22,6 +22,12 @@ from test.libregrtest.utils import format_duration, print_warning ...@@ -22,6 +22,12 @@ from test.libregrtest.utils import format_duration, print_warning
PROGRESS_UPDATE = 30.0 # seconds PROGRESS_UPDATE = 30.0 # seconds
assert PROGRESS_UPDATE >= PROGRESS_MIN_TIME assert PROGRESS_UPDATE >= PROGRESS_MIN_TIME
# Kill the main process after 5 minutes. It is supposed to write an update
# every PROGRESS_UPDATE seconds. Tolerate 5 minutes for Python slowest
# buildbot workers.
MAIN_PROCESS_TIMEOUT = 5 * 60.0
assert MAIN_PROCESS_TIMEOUT >= PROGRESS_UPDATE
# Time to wait until a worker completes: should be immediate # Time to wait until a worker completes: should be immediate
JOIN_TIMEOUT = 30.0 # seconds JOIN_TIMEOUT = 30.0 # seconds
...@@ -160,20 +166,6 @@ class TestWorkerProcess(threading.Thread): ...@@ -160,20 +166,6 @@ class TestWorkerProcess(threading.Thread):
result = TestResult(test_name, error_type, test_time, None) result = TestResult(test_name, error_type, test_time, None)
return MultiprocessResult(result, stdout, stderr, err_msg) return MultiprocessResult(result, stdout, stderr, err_msg)
def _timedout(self, test_name):
self._kill()
stdout = stderr = ''
popen = self._popen
try:
stdout, stderr = popen.communicate(timeout=JOIN_TIMEOUT)
except (subprocess.TimeoutExpired, OSError) as exc:
print_warning(f"Failed to read {self} output "
f"(timeout={format_duration(JOIN_TIMEOUT)}): "
f"{exc!r}")
return self.mp_result_error(test_name, TIMEOUT, stdout, stderr)
def _run_process(self, test_name): def _run_process(self, test_name):
self.start_time = time.monotonic() self.start_time = time.monotonic()
...@@ -197,23 +189,32 @@ class TestWorkerProcess(threading.Thread): ...@@ -197,23 +189,32 @@ class TestWorkerProcess(threading.Thread):
try: try:
stdout, stderr = popen.communicate(timeout=self.timeout) stdout, stderr = popen.communicate(timeout=self.timeout)
retcode = popen.returncode
assert retcode is not None
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
if self._stopped: if self._stopped:
# kill() has been called: communicate() fails # kill() has been called: communicate() fails
# on reading closed stdout/stderr # on reading closed stdout/stderr
raise ExitThread raise ExitThread
return self._timedout(test_name) # On timeout, kill the process
self._kill()
# None means TIMEOUT for the caller
retcode = None
# bpo-38207: Don't attempt to call communicate() again: on it
# can hang until all child processes using stdout and stderr
# pipes completes.
stdout = stderr = ''
except OSError: except OSError:
if self._stopped: if self._stopped:
# kill() has been called: communicate() fails # kill() has been called: communicate() fails
# on reading closed stdout/stderr # on reading closed stdout/stderr
raise ExitThread raise ExitThread
raise raise
else:
retcode = popen.returncode stdout = stdout.strip()
stdout = stdout.strip() stderr = stderr.rstrip()
stderr = stderr.rstrip()
return (retcode, stdout, stderr) return (retcode, stdout, stderr)
except: except:
...@@ -225,13 +226,10 @@ class TestWorkerProcess(threading.Thread): ...@@ -225,13 +226,10 @@ class TestWorkerProcess(threading.Thread):
self.current_test_name = None self.current_test_name = None
def _runtest(self, test_name): def _runtest(self, test_name):
result = self._run_process(test_name) retcode, stdout, stderr = self._run_process(test_name)
if isinstance(result, MultiprocessResult):
# _timedout() case
return result
retcode, stdout, stderr = result if retcode is None:
return self.mp_result_error(test_name, TIMEOUT, stdout, stderr)
err_msg = None err_msg = None
if retcode != 0: if retcode != 0:
...@@ -250,7 +248,8 @@ class TestWorkerProcess(threading.Thread): ...@@ -250,7 +248,8 @@ class TestWorkerProcess(threading.Thread):
err_msg = "Failed to parse worker JSON: %s" % exc err_msg = "Failed to parse worker JSON: %s" % exc
if err_msg is not None: if err_msg is not None:
return self.mp_result_error(test_name, CHILD_ERROR, stdout, stderr, err_msg) return self.mp_result_error(test_name, CHILD_ERROR,
stdout, stderr, err_msg)
return MultiprocessResult(result, stdout, stderr, err_msg) return MultiprocessResult(result, stdout, stderr, err_msg)
...@@ -289,6 +288,16 @@ class TestWorkerProcess(threading.Thread): ...@@ -289,6 +288,16 @@ class TestWorkerProcess(threading.Thread):
f"{exc!r}") f"{exc!r}")
def wait_stopped(self, start_time): def wait_stopped(self, start_time):
# bpo-38207: MultiprocessTestRunner.stop_workers() called self.stop()
# which killed the process. Sometimes, killing the process from the
# main thread does not interrupt popen.communicate() in
# TestWorkerProcess thread. This loop with a timeout is a workaround
# for that.
#
# Moreover, if this method fails to join the thread, it is likely
# that Python will hang at exit while calling threading._shutdown()
# which tries again to join the blocked thread. Regrtest.main()
# uses EXIT_TIMEOUT to workaround this second bug.
while True: while True:
# Write a message every second # Write a message every second
self.join(1.0) self.join(1.0)
...@@ -355,7 +364,8 @@ class MultiprocessTestRunner: ...@@ -355,7 +364,8 @@ class MultiprocessTestRunner:
timeout = PROGRESS_UPDATE timeout = PROGRESS_UPDATE
while True: while True:
if use_faulthandler: if use_faulthandler:
faulthandler.dump_traceback_later(timeout * 2.0, exit=True) faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT,
exit=True)
# wait for a thread # wait for a thread
try: try:
......
...@@ -1153,7 +1153,6 @@ class ArgsTestCase(BaseTestCase): ...@@ -1153,7 +1153,6 @@ class ArgsTestCase(BaseTestCase):
env_changed=[testname], env_changed=[testname],
fail_env_changed=True) fail_env_changed=True)
@unittest.skipIf(True, 'bpo-37531, bpo-38207: test hangs randomly')
def test_multiprocessing_timeout(self): def test_multiprocessing_timeout(self):
code = textwrap.dedent(r""" code = textwrap.dedent(r"""
import time import time
......
On timeout, regrtest no longer attempts to call ``popen.communicate()``
again: it can hang until all child processes using stdout and stderr pipes
completes. Kill the worker process and ignores its output. Change also the
faulthandler timeout of the main process from 1 minute to 5 minutes, for Python
slowest buildbots.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment