Commit 94459fd7 authored by Thomas Moreau's avatar Thomas Moreau Committed by Antoine Pitrou

bpo-31699 Deadlocks in `concurrent.futures.ProcessPoolExecutor` with pickling error (#3895)

Fix deadlocks in :class:`concurrent.futures.ProcessPoolExecutor` when task arguments or results cause pickling or unpickling errors.
This should make sure that calls to the :class:`ProcessPoolExecutor` API always eventually return.
parent 65f2a6dc
...@@ -8,10 +8,10 @@ The follow diagram and text describe the data-flow through the system: ...@@ -8,10 +8,10 @@ The follow diagram and text describe the data-flow through the system:
|======================= In-process =====================|== Out-of-process ==| |======================= In-process =====================|== Out-of-process ==|
+----------+ +----------+ +--------+ +-----------+ +---------+ +----------+ +----------+ +--------+ +-----------+ +---------+
| | => | Work Ids | => | | => | Call Q | => | | | | => | Work Ids | | | | Call Q | | Process |
| | +----------+ | | +-----------+ | | | | +----------+ | | +-----------+ | Pool |
| | | ... | | | | ... | | | | | | ... | | | | ... | +---------+
| | | 6 | | | | 5, call() | | | | | | 6 | => | | => | 5, call() | => | |
| | | 7 | | | | ... | | | | | | 7 | | | | ... | | |
| Process | | ... | | Local | +-----------+ | Process | | Process | | ... | | Local | +-----------+ | Process |
| Pool | +----------+ | Worker | | #1..n | | Pool | +----------+ | Worker | | #1..n |
...@@ -52,6 +52,7 @@ import queue ...@@ -52,6 +52,7 @@ import queue
from queue import Full from queue import Full
import multiprocessing as mp import multiprocessing as mp
from multiprocessing.connection import wait from multiprocessing.connection import wait
from multiprocessing.queues import Queue
import threading import threading
import weakref import weakref
from functools import partial from functools import partial
...@@ -72,16 +73,31 @@ import traceback ...@@ -72,16 +73,31 @@ import traceback
# workers to exit when their work queues are empty and then waits until the # workers to exit when their work queues are empty and then waits until the
# threads/processes finish. # threads/processes finish.
_threads_queues = weakref.WeakKeyDictionary() _threads_wakeups = weakref.WeakKeyDictionary()
_global_shutdown = False _global_shutdown = False
class _ThreadWakeup:
__slot__ = ["_state"]
def __init__(self):
self._reader, self._writer = mp.Pipe(duplex=False)
def wakeup(self):
self._writer.send_bytes(b"")
def clear(self):
while self._reader.poll():
self._reader.recv_bytes()
def _python_exit(): def _python_exit():
global _global_shutdown global _global_shutdown
_global_shutdown = True _global_shutdown = True
items = list(_threads_queues.items()) items = list(_threads_wakeups.items())
for t, q in items: for _, thread_wakeup in items:
q.put(None) thread_wakeup.wakeup()
for t, q in items: for t, _ in items:
t.join() t.join()
# Controls how many more calls than processes will be queued in the call queue. # Controls how many more calls than processes will be queued in the call queue.
...@@ -90,6 +106,7 @@ def _python_exit(): ...@@ -90,6 +106,7 @@ def _python_exit():
# (Futures in the call queue cannot be cancelled). # (Futures in the call queue cannot be cancelled).
EXTRA_QUEUED_CALLS = 1 EXTRA_QUEUED_CALLS = 1
# Hack to embed stringification of remote traceback in local traceback # Hack to embed stringification of remote traceback in local traceback
class _RemoteTraceback(Exception): class _RemoteTraceback(Exception):
...@@ -132,6 +149,25 @@ class _CallItem(object): ...@@ -132,6 +149,25 @@ class _CallItem(object):
self.kwargs = kwargs self.kwargs = kwargs
class _SafeQueue(Queue):
"""Safe Queue set exception to the future object linked to a job"""
def __init__(self, max_size=0, *, ctx, pending_work_items):
self.pending_work_items = pending_work_items
super().__init__(max_size, ctx=ctx)
def _on_queue_feeder_error(self, e, obj):
if isinstance(obj, _CallItem):
tb = traceback.format_exception(type(e), e, e.__traceback__)
e.__cause__ = _RemoteTraceback('\n"""\n{}"""'.format(''.join(tb)))
work_item = self.pending_work_items.pop(obj.work_id, None)
# work_item can be None if another process terminated. In this case,
# the queue_manager_thread fails all work_items with BrokenProcessPool
if work_item is not None:
work_item.future.set_exception(e)
else:
super()._on_queue_feeder_error(e, obj)
def _get_chunks(*iterables, chunksize): def _get_chunks(*iterables, chunksize):
""" Iterates over zip()ed iterables in chunks. """ """ Iterates over zip()ed iterables in chunks. """
it = zip(*iterables) it = zip(*iterables)
...@@ -152,6 +188,17 @@ def _process_chunk(fn, chunk): ...@@ -152,6 +188,17 @@ def _process_chunk(fn, chunk):
""" """
return [fn(*args) for args in chunk] return [fn(*args) for args in chunk]
def _sendback_result(result_queue, work_id, result=None, exception=None):
"""Safely send back the given result or exception"""
try:
result_queue.put(_ResultItem(work_id, result=result,
exception=exception))
except BaseException as e:
exc = _ExceptionWithTraceback(e, e.__traceback__)
result_queue.put(_ResultItem(work_id, exception=exc))
def _process_worker(call_queue, result_queue, initializer, initargs): def _process_worker(call_queue, result_queue, initializer, initargs):
"""Evaluates calls from call_queue and places the results in result_queue. """Evaluates calls from call_queue and places the results in result_queue.
...@@ -183,10 +230,9 @@ def _process_worker(call_queue, result_queue, initializer, initargs): ...@@ -183,10 +230,9 @@ def _process_worker(call_queue, result_queue, initializer, initargs):
r = call_item.fn(*call_item.args, **call_item.kwargs) r = call_item.fn(*call_item.args, **call_item.kwargs)
except BaseException as e: except BaseException as e:
exc = _ExceptionWithTraceback(e, e.__traceback__) exc = _ExceptionWithTraceback(e, e.__traceback__)
result_queue.put(_ResultItem(call_item.work_id, exception=exc)) _sendback_result(result_queue, call_item.work_id, exception=exc)
else: else:
result_queue.put(_ResultItem(call_item.work_id, _sendback_result(result_queue, call_item.work_id, result=r)
result=r))
# Liberate the resource as soon as possible, to avoid holding onto # Liberate the resource as soon as possible, to avoid holding onto
# open files or shared memory that is not needed anymore # open files or shared memory that is not needed anymore
...@@ -230,12 +276,14 @@ def _add_call_item_to_queue(pending_work_items, ...@@ -230,12 +276,14 @@ def _add_call_item_to_queue(pending_work_items,
del pending_work_items[work_id] del pending_work_items[work_id]
continue continue
def _queue_management_worker(executor_reference, def _queue_management_worker(executor_reference,
processes, processes,
pending_work_items, pending_work_items,
work_ids_queue, work_ids_queue,
call_queue, call_queue,
result_queue): result_queue,
thread_wakeup):
"""Manages the communication between this process and the worker processes. """Manages the communication between this process and the worker processes.
This function is run in a local thread. This function is run in a local thread.
...@@ -253,6 +301,9 @@ def _queue_management_worker(executor_reference, ...@@ -253,6 +301,9 @@ def _queue_management_worker(executor_reference,
derived from _WorkItems for processing by the process workers. derived from _WorkItems for processing by the process workers.
result_queue: A ctx.SimpleQueue of _ResultItems generated by the result_queue: A ctx.SimpleQueue of _ResultItems generated by the
process workers. process workers.
thread_wakeup: A _ThreadWakeup to allow waking up the
queue_manager_thread from the main Thread and avoid deadlocks
caused by permanently locked queues.
""" """
executor = None executor = None
...@@ -261,10 +312,21 @@ def _queue_management_worker(executor_reference, ...@@ -261,10 +312,21 @@ def _queue_management_worker(executor_reference,
or executor._shutdown_thread) or executor._shutdown_thread)
def shutdown_worker(): def shutdown_worker():
# This is an upper bound # This is an upper bound on the number of children alive.
nb_children_alive = sum(p.is_alive() for p in processes.values()) n_children_alive = sum(p.is_alive() for p in processes.values())
for i in range(0, nb_children_alive): n_children_to_stop = n_children_alive
call_queue.put_nowait(None) n_sentinels_sent = 0
# Send the right number of sentinels, to make sure all children are
# properly terminated.
while n_sentinels_sent < n_children_to_stop and n_children_alive > 0:
for i in range(n_children_to_stop - n_sentinels_sent):
try:
call_queue.put_nowait(None)
n_sentinels_sent += 1
except Full:
break
n_children_alive = sum(p.is_alive() for p in processes.values())
# Release the queue's resources as soon as possible. # Release the queue's resources as soon as possible.
call_queue.close() call_queue.close()
# If .join() is not called on the created processes then # If .join() is not called on the created processes then
...@@ -272,19 +334,37 @@ def _queue_management_worker(executor_reference, ...@@ -272,19 +334,37 @@ def _queue_management_worker(executor_reference,
for p in processes.values(): for p in processes.values():
p.join() p.join()
reader = result_queue._reader result_reader = result_queue._reader
wakeup_reader = thread_wakeup._reader
readers = [result_reader, wakeup_reader]
while True: while True:
_add_call_item_to_queue(pending_work_items, _add_call_item_to_queue(pending_work_items,
work_ids_queue, work_ids_queue,
call_queue) call_queue)
sentinels = [p.sentinel for p in processes.values()] # Wait for a result to be ready in the result_queue while checking
assert sentinels # that all worker processes are still running, or for a wake up
ready = wait([reader] + sentinels) # signal send. The wake up signals come either from new tasks being
if reader in ready: # submitted, from the executor being shutdown/gc-ed, or from the
result_item = reader.recv() # shutdown of the python interpreter.
else: worker_sentinels = [p.sentinel for p in processes.values()]
ready = wait(readers + worker_sentinels)
cause = None
is_broken = True
if result_reader in ready:
try:
result_item = result_reader.recv()
is_broken = False
except BaseException as e:
cause = traceback.format_exception(type(e), e, e.__traceback__)
elif wakeup_reader in ready:
is_broken = False
result_item = None
thread_wakeup.clear()
if is_broken:
# Mark the process pool broken so that submits fail right now. # Mark the process pool broken so that submits fail right now.
executor = executor_reference() executor = executor_reference()
if executor is not None: if executor is not None:
...@@ -293,14 +373,15 @@ def _queue_management_worker(executor_reference, ...@@ -293,14 +373,15 @@ def _queue_management_worker(executor_reference,
'usable anymore') 'usable anymore')
executor._shutdown_thread = True executor._shutdown_thread = True
executor = None executor = None
bpe = BrokenProcessPool("A process in the process pool was "
"terminated abruptly while the future was "
"running or pending.")
if cause is not None:
bpe.__cause__ = _RemoteTraceback(
f"\n'''\n{''.join(cause)}'''")
# All futures in flight must be marked failed # All futures in flight must be marked failed
for work_id, work_item in pending_work_items.items(): for work_id, work_item in pending_work_items.items():
work_item.future.set_exception( work_item.future.set_exception(bpe)
BrokenProcessPool(
"A process in the process pool was "
"terminated abruptly while the future was "
"running or pending."
))
# Delete references to object. See issue16284 # Delete references to object. See issue16284
del work_item del work_item
pending_work_items.clear() pending_work_items.clear()
...@@ -329,6 +410,9 @@ def _queue_management_worker(executor_reference, ...@@ -329,6 +410,9 @@ def _queue_management_worker(executor_reference,
work_item.future.set_result(result_item.result) work_item.future.set_result(result_item.result)
# Delete references to object. See issue16284 # Delete references to object. See issue16284
del work_item del work_item
# Delete reference to result_item
del result_item
# Check whether we should start shutting down. # Check whether we should start shutting down.
executor = executor_reference() executor = executor_reference()
# No more work items can be added if: # No more work items can be added if:
...@@ -348,8 +432,11 @@ def _queue_management_worker(executor_reference, ...@@ -348,8 +432,11 @@ def _queue_management_worker(executor_reference,
pass pass
executor = None executor = None
_system_limits_checked = False _system_limits_checked = False
_system_limited = None _system_limited = None
def _check_system_limits(): def _check_system_limits():
global _system_limits_checked, _system_limited global _system_limits_checked, _system_limited
if _system_limits_checked: if _system_limits_checked:
...@@ -369,7 +456,8 @@ def _check_system_limits(): ...@@ -369,7 +456,8 @@ def _check_system_limits():
# minimum number of semaphores available # minimum number of semaphores available
# according to POSIX # according to POSIX
return return
_system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max _system_limited = ("system provides too few semaphores (%d"
" available, 256 necessary)" % nsems_max)
raise NotImplementedError(_system_limited) raise NotImplementedError(_system_limited)
...@@ -415,6 +503,7 @@ class ProcessPoolExecutor(_base.Executor): ...@@ -415,6 +503,7 @@ class ProcessPoolExecutor(_base.Executor):
raise ValueError("max_workers must be greater than 0") raise ValueError("max_workers must be greater than 0")
self._max_workers = max_workers self._max_workers = max_workers
if mp_context is None: if mp_context is None:
mp_context = mp.get_context() mp_context = mp.get_context()
self._mp_context = mp_context self._mp_context = mp_context
...@@ -424,34 +513,52 @@ class ProcessPoolExecutor(_base.Executor): ...@@ -424,34 +513,52 @@ class ProcessPoolExecutor(_base.Executor):
self._initializer = initializer self._initializer = initializer
self._initargs = initargs self._initargs = initargs
# Management thread
self._queue_management_thread = None
# Map of pids to processes
self._processes = {}
# Shutdown is a two-step process.
self._shutdown_thread = False
self._shutdown_lock = threading.Lock()
self._broken = False
self._queue_count = 0
self._pending_work_items = {}
# Create communication channels for the executor
# Make the call queue slightly larger than the number of processes to # Make the call queue slightly larger than the number of processes to
# prevent the worker processes from idling. But don't make it too big # prevent the worker processes from idling. But don't make it too big
# because futures in the call queue cannot be cancelled. # because futures in the call queue cannot be cancelled.
queue_size = self._max_workers + EXTRA_QUEUED_CALLS queue_size = self._max_workers + EXTRA_QUEUED_CALLS
self._call_queue = mp_context.Queue(queue_size) self._call_queue = _SafeQueue(
max_size=queue_size, ctx=self._mp_context,
pending_work_items=self._pending_work_items)
# Killed worker processes can produce spurious "broken pipe" # Killed worker processes can produce spurious "broken pipe"
# tracebacks in the queue's own worker thread. But we detect killed # tracebacks in the queue's own worker thread. But we detect killed
# processes anyway, so silence the tracebacks. # processes anyway, so silence the tracebacks.
self._call_queue._ignore_epipe = True self._call_queue._ignore_epipe = True
self._result_queue = mp_context.SimpleQueue() self._result_queue = mp_context.SimpleQueue()
self._work_ids = queue.Queue() self._work_ids = queue.Queue()
self._queue_management_thread = None
# Map of pids to processes
self._processes = {}
# Shutdown is a two-step process. # _ThreadWakeup is a communication channel used to interrupt the wait
self._shutdown_thread = False # of the main loop of queue_manager_thread from another thread (e.g.
self._shutdown_lock = threading.Lock() # when calling executor.submit or executor.shutdown). We do not use the
self._broken = False # _result_queue to send the wakeup signal to the queue_manager_thread
self._queue_count = 0 # as it could result in a deadlock if a worker process dies with the
self._pending_work_items = {} # _result_queue write lock still acquired.
self._queue_management_thread_wakeup = _ThreadWakeup()
def _start_queue_management_thread(self): def _start_queue_management_thread(self):
# When the executor gets lost, the weakref callback will wake up
# the queue management thread.
def weakref_cb(_, q=self._result_queue):
q.put(None)
if self._queue_management_thread is None: if self._queue_management_thread is None:
# When the executor gets garbarge collected, the weakref callback
# will wake up the queue management thread so that it can terminate
# if there is no pending work item.
def weakref_cb(_,
thread_wakeup=self._queue_management_thread_wakeup):
mp.util.debug('Executor collected: triggering callback for'
' QueueManager wakeup')
thread_wakeup.wakeup()
# Start the processes so that their sentinels are known. # Start the processes so that their sentinels are known.
self._adjust_process_count() self._adjust_process_count()
self._queue_management_thread = threading.Thread( self._queue_management_thread = threading.Thread(
...@@ -461,10 +568,13 @@ class ProcessPoolExecutor(_base.Executor): ...@@ -461,10 +568,13 @@ class ProcessPoolExecutor(_base.Executor):
self._pending_work_items, self._pending_work_items,
self._work_ids, self._work_ids,
self._call_queue, self._call_queue,
self._result_queue)) self._result_queue,
self._queue_management_thread_wakeup),
name="QueueManagerThread")
self._queue_management_thread.daemon = True self._queue_management_thread.daemon = True
self._queue_management_thread.start() self._queue_management_thread.start()
_threads_queues[self._queue_management_thread] = self._result_queue _threads_wakeups[self._queue_management_thread] = \
self._queue_management_thread_wakeup
def _adjust_process_count(self): def _adjust_process_count(self):
for _ in range(len(self._processes), self._max_workers): for _ in range(len(self._processes), self._max_workers):
...@@ -491,7 +601,7 @@ class ProcessPoolExecutor(_base.Executor): ...@@ -491,7 +601,7 @@ class ProcessPoolExecutor(_base.Executor):
self._work_ids.put(self._queue_count) self._work_ids.put(self._queue_count)
self._queue_count += 1 self._queue_count += 1
# Wake up queue management thread # Wake up queue management thread
self._result_queue.put(None) self._queue_management_thread_wakeup.wakeup()
self._start_queue_management_thread() self._start_queue_management_thread()
return f return f
...@@ -531,7 +641,7 @@ class ProcessPoolExecutor(_base.Executor): ...@@ -531,7 +641,7 @@ class ProcessPoolExecutor(_base.Executor):
self._shutdown_thread = True self._shutdown_thread = True
if self._queue_management_thread: if self._queue_management_thread:
# Wake up queue management thread # Wake up queue management thread
self._result_queue.put(None) self._queue_management_thread_wakeup.wakeup()
if wait: if wait:
self._queue_management_thread.join() self._queue_management_thread.join()
# To reduce the risk of opening too many files, remove references to # To reduce the risk of opening too many files, remove references to
......
...@@ -160,9 +160,10 @@ class Queue(object): ...@@ -160,9 +160,10 @@ class Queue(object):
self._thread = threading.Thread( self._thread = threading.Thread(
target=Queue._feed, target=Queue._feed,
args=(self._buffer, self._notempty, self._send_bytes, args=(self._buffer, self._notempty, self._send_bytes,
self._wlock, self._writer.close, self._ignore_epipe), self._wlock, self._writer.close, self._ignore_epipe,
self._on_queue_feeder_error),
name='QueueFeederThread' name='QueueFeederThread'
) )
self._thread.daemon = True self._thread.daemon = True
debug('doing self._thread.start()') debug('doing self._thread.start()')
...@@ -201,7 +202,8 @@ class Queue(object): ...@@ -201,7 +202,8 @@ class Queue(object):
notempty.notify() notempty.notify()
@staticmethod @staticmethod
def _feed(buffer, notempty, send_bytes, writelock, close, ignore_epipe): def _feed(buffer, notempty, send_bytes, writelock, close, ignore_epipe,
onerror):
debug('starting thread to feed data to pipe') debug('starting thread to feed data to pipe')
nacquire = notempty.acquire nacquire = notempty.acquire
nrelease = notempty.release nrelease = notempty.release
...@@ -253,8 +255,17 @@ class Queue(object): ...@@ -253,8 +255,17 @@ class Queue(object):
info('error in queue thread: %s', e) info('error in queue thread: %s', e)
return return
else: else:
import traceback onerror(e, obj)
traceback.print_exc()
@staticmethod
def _on_queue_feeder_error(e, obj):
"""
Private API hook called when feeding data in the background thread
raises an exception. For overriding by concurrent.futures.
"""
import traceback
traceback.print_exc()
_sentinel = object() _sentinel = object()
......
...@@ -1029,6 +1029,43 @@ class _TestQueue(BaseTestCase): ...@@ -1029,6 +1029,43 @@ class _TestQueue(BaseTestCase):
self.assertTrue(q.get(timeout=1.0)) self.assertTrue(q.get(timeout=1.0))
close_queue(q) close_queue(q)
def test_queue_feeder_on_queue_feeder_error(self):
# bpo-30006: verify feeder handles exceptions using the
# _on_queue_feeder_error hook.
if self.TYPE != 'processes':
self.skipTest('test not appropriate for {}'.format(self.TYPE))
class NotSerializable(object):
"""Mock unserializable object"""
def __init__(self):
self.reduce_was_called = False
self.on_queue_feeder_error_was_called = False
def __reduce__(self):
self.reduce_was_called = True
raise AttributeError
class SafeQueue(multiprocessing.queues.Queue):
"""Queue with overloaded _on_queue_feeder_error hook"""
@staticmethod
def _on_queue_feeder_error(e, obj):
if (isinstance(e, AttributeError) and
isinstance(obj, NotSerializable)):
obj.on_queue_feeder_error_was_called = True
not_serializable_obj = NotSerializable()
# The captured_stderr reduces the noise in the test report
with test.support.captured_stderr():
q = SafeQueue(ctx=multiprocessing.get_context())
q.put(not_serializable_obj)
# Verify that q is still functionning correctly
q.put(True)
self.assertTrue(q.get(timeout=1.0))
# Assert that the serialization and the hook have been called correctly
self.assertTrue(not_serializable_obj.reduce_was_called)
self.assertTrue(not_serializable_obj.on_queue_feeder_error_was_called)
# #
# #
# #
......
...@@ -18,6 +18,7 @@ import threading ...@@ -18,6 +18,7 @@ import threading
import time import time
import unittest import unittest
import weakref import weakref
from pickle import PicklingError
from concurrent import futures from concurrent import futures
from concurrent.futures._base import ( from concurrent.futures._base import (
...@@ -394,16 +395,17 @@ class ProcessPoolShutdownTest(ExecutorShutdownTest): ...@@ -394,16 +395,17 @@ class ProcessPoolShutdownTest(ExecutorShutdownTest):
queue_management_thread = executor._queue_management_thread queue_management_thread = executor._queue_management_thread
processes = executor._processes processes = executor._processes
call_queue = executor._call_queue call_queue = executor._call_queue
queue_management_thread = executor._queue_management_thread
del executor del executor
# Make sure that all the executor ressources were properly cleaned by
# the shutdown process
queue_management_thread.join() queue_management_thread.join()
for p in processes.values(): for p in processes.values():
p.join() p.join()
call_queue.close()
call_queue.join_thread() call_queue.join_thread()
create_executor_tests(ProcessPoolShutdownTest, create_executor_tests(ProcessPoolShutdownTest,
executor_mixins=(ProcessPoolForkMixin, executor_mixins=(ProcessPoolForkMixin,
ProcessPoolForkserverMixin, ProcessPoolForkserverMixin,
...@@ -784,6 +786,172 @@ create_executor_tests(ProcessPoolExecutorTest, ...@@ -784,6 +786,172 @@ create_executor_tests(ProcessPoolExecutorTest,
ProcessPoolForkserverMixin, ProcessPoolForkserverMixin,
ProcessPoolSpawnMixin)) ProcessPoolSpawnMixin))
def hide_process_stderr():
import io
sys.stderr = io.StringIO()
def _crash(delay=None):
"""Induces a segfault."""
if delay:
time.sleep(delay)
import faulthandler
faulthandler.disable()
faulthandler._sigsegv()
def _exit():
"""Induces a sys exit with exitcode 1."""
sys.exit(1)
def _raise_error(Err):
"""Function that raises an Exception in process."""
hide_process_stderr()
raise Err()
def _return_instance(cls):
"""Function that returns a instance of cls."""
hide_process_stderr()
return cls()
class CrashAtPickle(object):
"""Bad object that triggers a segfault at pickling time."""
def __reduce__(self):
_crash()
class CrashAtUnpickle(object):
"""Bad object that triggers a segfault at unpickling time."""
def __reduce__(self):
return _crash, ()
class ExitAtPickle(object):
"""Bad object that triggers a process exit at pickling time."""
def __reduce__(self):
_exit()
class ExitAtUnpickle(object):
"""Bad object that triggers a process exit at unpickling time."""
def __reduce__(self):
return _exit, ()
class ErrorAtPickle(object):
"""Bad object that triggers an error at pickling time."""
def __reduce__(self):
from pickle import PicklingError
raise PicklingError("Error in pickle")
class ErrorAtUnpickle(object):
"""Bad object that triggers an error at unpickling time."""
def __reduce__(self):
from pickle import UnpicklingError
return _raise_error, (UnpicklingError, )
class ExecutorDeadlockTest:
TIMEOUT = 15
@classmethod
def _sleep_id(cls, x, delay):
time.sleep(delay)
return x
def _fail_on_deadlock(self, executor):
# If we did not recover before TIMEOUT seconds, consider that the
# executor is in a deadlock state and forcefully clean all its
# composants.
import faulthandler
from tempfile import TemporaryFile
with TemporaryFile(mode="w+") as f:
faulthandler.dump_traceback(file=f)
f.seek(0)
tb = f.read()
for p in executor._processes.values():
p.terminate()
# This should be safe to call executor.shutdown here as all possible
# deadlocks should have been broken.
executor.shutdown(wait=True)
print(f"\nTraceback:\n {tb}", file=sys.__stderr__)
self.fail(f"Executor deadlock:\n\n{tb}")
def test_crash(self):
# extensive testing for deadlock caused by crashes in a pool.
self.executor.shutdown(wait=True)
crash_cases = [
# Check problem occuring while pickling a task in
# the task_handler thread
(id, (ErrorAtPickle(),), PicklingError, "error at task pickle"),
# Check problem occuring while unpickling a task on workers
(id, (ExitAtUnpickle(),), BrokenProcessPool,
"exit at task unpickle"),
(id, (ErrorAtUnpickle(),), BrokenProcessPool,
"error at task unpickle"),
(id, (CrashAtUnpickle(),), BrokenProcessPool,
"crash at task unpickle"),
# Check problem occuring during func execution on workers
(_crash, (), BrokenProcessPool,
"crash during func execution on worker"),
(_exit, (), SystemExit,
"exit during func execution on worker"),
(_raise_error, (RuntimeError, ), RuntimeError,
"error during func execution on worker"),
# Check problem occuring while pickling a task result
# on workers
(_return_instance, (CrashAtPickle,), BrokenProcessPool,
"crash during result pickle on worker"),
(_return_instance, (ExitAtPickle,), SystemExit,
"exit during result pickle on worker"),
(_return_instance, (ErrorAtPickle,), PicklingError,
"error during result pickle on worker"),
# Check problem occuring while unpickling a task in
# the result_handler thread
(_return_instance, (ErrorAtUnpickle,), BrokenProcessPool,
"error during result unpickle in result_handler"),
(_return_instance, (ExitAtUnpickle,), BrokenProcessPool,
"exit during result unpickle in result_handler")
]
for func, args, error, name in crash_cases:
with self.subTest(name):
# The captured_stderr reduces the noise in the test report
with test.support.captured_stderr():
executor = self.executor_type(
max_workers=2, mp_context=get_context(self.ctx))
res = executor.submit(func, *args)
with self.assertRaises(error):
try:
res.result(timeout=self.TIMEOUT)
except futures.TimeoutError:
# If we did not recover before TIMEOUT seconds,
# consider that the executor is in a deadlock state
self._fail_on_deadlock(executor)
executor.shutdown(wait=True)
def test_shutdown_deadlock(self):
# Test that the pool calling shutdown do not cause deadlock
# if a worker fails after the shutdown call.
self.executor.shutdown(wait=True)
with self.executor_type(max_workers=2,
mp_context=get_context(self.ctx)) as executor:
self.executor = executor # Allow clean up in fail_on_deadlock
f = executor.submit(_crash, delay=.1)
executor.shutdown(wait=True)
with self.assertRaises(BrokenProcessPool):
f.result()
create_executor_tests(ExecutorDeadlockTest,
executor_mixins=(ProcessPoolForkMixin,
ProcessPoolForkserverMixin,
ProcessPoolSpawnMixin))
class FutureTests(BaseTestCase): class FutureTests(BaseTestCase):
def test_done_callback_with_result(self): def test_done_callback_with_result(self):
......
Fix deadlocks in :class:`concurrent.futures.ProcessPoolExecutor` when
task arguments or results cause pickling or unpickling errors.
This should make sure that calls to the :class:`ProcessPoolExecutor` API
always eventually return.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment