bpo-6721: Hold logging locks across fork() (GH-4071)

bpo-6721: When os.fork() was called while another thread holds a logging lock, the child process may deadlock when it tries to log. This fixes that by acquiring all logging locks before fork and releasing them afterwards. A regression test that fails before this change is included. Within the new unittest itself: There is a small _potential_ due to mixing of fork and a thread in the child process if the parent's thread happened to hold a non-reentrant library call lock (malloc?) when the os.fork() happens. buildbots and time will tell if this actually manifests itself in this test or not. :/ A functionality test that avoids that would be a challenge. An alternate test that isn't trying to produce the deadlock itself but just checking that the release and acquire calls are made would be the next best alternative if so.

bpo-6721: Hold logging locks across fork() (GH-4071)
bpo-6721: When os.fork() was called while another thread holds a logging lock, the child process may deadlock when it tries to log. This fixes that by acquiring all logging locks before fork and releasing them afterwards. A regression test that fails before this change is included. Within the new unittest itself: There is a small _potential_ due to mixing of fork and a thread in the child process if the parent's thread happened to hold a non-reentrant library call lock (malloc?) when the os.fork() happens. buildbots and time will tell if this actually manifests itself in this test or not. :/ A functionality test that avoids that would be a challenge. An alternate test that isn't trying to produce the deadlock itself but just checking that the release and acquire calls are made would be the next best alternative if so.
19003841 · Gregory P. Smith · GitHub · ea13740a · 19003841 · 19003841
Commit 19003841 authored Sep 13, 2018 by Gregory P. Smith Committed by GitHub Sep 13, 2018
3 changed files
--- a/Lib/logging/__init__.py
+++ b/Lib/logging/__init__.py
@@ -225,6 +225,55 @@ def _releaseLock():
    if _lock:
        _lock.release()

+
+# Prevent a held logging lock from blocking a child from logging.
+
+if not hasattr(os, 'register_at_fork'):  # Windows and friends.
+    def _register_at_fork_acquire_release(instance):
+        pass  # no-op when os.register_at_fork does not exist.
+else:  # The os.register_at_fork API exists
+    os.register_at_fork(before=_acquireLock,
+                        after_in_child=_releaseLock,
+                        after_in_parent=_releaseLock)
+
+    # A collection of instances with acquire and release methods (logging.Handler)
+    # to be called before and after fork.  The weakref avoids us keeping discarded
+    # Handler instances alive forever in case an odd program creates and destroys
+    # many over its lifetime.
+    _at_fork_acquire_release_weakset = weakref.WeakSet()
+
+
+    def _register_at_fork_acquire_release(instance):
+        # We put the instance itself in a single WeakSet as we MUST have only
+        # one atomic weak ref. used by both before and after atfork calls to
+        # guarantee matched pairs of acquire and release calls.
+        _at_fork_acquire_release_weakset.add(instance)
+
+
+    def _at_fork_weak_calls(method_name):
+        for instance in _at_fork_acquire_release_weakset:
+            method = getattr(instance, method_name)
+            try:
+                method()
+            except Exception as err:
+                # Similar to what PyErr_WriteUnraisable does.
+                print("Ignoring exception from logging atfork", instance,
+                      method_name, "method:", err, file=sys.stderr)
+
+
+    def _before_at_fork_weak_calls():
+        _at_fork_weak_calls('acquire')
+
+
+    def _after_at_fork_weak_calls():
+        _at_fork_weak_calls('release')
+
+
+    os.register_at_fork(before=_before_at_fork_weak_calls,
+                        after_in_child=_after_at_fork_weak_calls,
+                        after_in_parent=_after_at_fork_weak_calls)
+
+
 #---------------------------------------------------------------------------
 #   The logging record
 #---------------------------------------------------------------------------
@@ -795,6 +844,7 @@ class Handler(Filterer):
        Acquire a thread lock for serializing access to the underlying I/O.
        """
        self.lock = threading.RLock()
+        _register_at_fork_acquire_release(self)

    def acquire(self):
        """

--- a/Lib/test/test_logging.py
+++ b/Lib/test/test_logging.py
@@ -35,6 +35,7 @@ import os
 import queue
 import random
 import re
+import signal
 import socket
 import struct
 import sys
@@ -666,6 +667,72 @@ class HandlerTest(BaseTest):
                if os.path.exists(fn):
                    os.unlink(fn)

+    # The implementation relies on os.register_at_fork existing, but we test
+    # based on os.fork existing because that is what users and this test use.
+    # This helps ensure that when fork exists (the important concept) that the
+    # register_at_fork mechanism is also present and used.
+    @unittest.skipIf(not hasattr(os, 'fork'), 'Test requires os.fork().')
+    def test_post_fork_child_no_deadlock(self):
+        """Ensure forked child logging locks are not held; bpo-6721."""
+        refed_h = logging.Handler()
+        refed_h.name = 'because we need at least one for this test'
+        self.assertGreater(len(logging._handlers), 0)
+
+        locks_held__ready_to_fork = threading.Event()
+        fork_happened__release_locks_and_end_thread = threading.Event()
+
+        def lock_holder_thread_fn():
+            logging._acquireLock()
+            try:
+                refed_h.acquire()
+                try:
+                    # Tell the main thread to do the fork.
+                    locks_held__ready_to_fork.set()
+
+                    # If the deadlock bug exists, the fork will happen
+                    # without dealing with the locks we hold, deadlocking
+                    # the child.
+
+                    # Wait for a successful fork or an unreasonable amount of
+                    # time before releasing our locks.  To avoid a timing based
+                    # test we'd need communication from os.fork() as to when it
+                    # has actually happened.  Given this is a regression test
+                    # for a fixed issue, potentially less reliably detecting
+                    # regression via timing is acceptable for simplicity.
+                    # The test will always take at least this long. :(
+                    fork_happened__release_locks_and_end_thread.wait(0.5)
+                finally:
+                    refed_h.release()
+            finally:
+                logging._releaseLock()
+
+        lock_holder_thread = threading.Thread(
+                target=lock_holder_thread_fn,
+                name='test_post_fork_child_no_deadlock lock holder')
+        lock_holder_thread.start()
+
+        locks_held__ready_to_fork.wait()
+        pid = os.fork()
+        if pid == 0:  # Child.
+            logging.error(r'Child process did not deadlock. \o/')
+            os._exit(0)
+        else:  # Parent.
+            fork_happened__release_locks_and_end_thread.set()
+            lock_holder_thread.join()
+            start_time = time.monotonic()
+            while True:
+                waited_pid, status = os.waitpid(pid, os.WNOHANG)
+                if waited_pid == pid:
+                    break  # child process exited.
+                if time.monotonic() - start_time > 7:
+                    break  # so long? implies child deadlock.
+                time.sleep(0.05)
+            if waited_pid != pid:
+                os.kill(pid, signal.SIGKILL)
+                waited_pid, status = os.waitpid(pid, 0)
+                self.fail("child process deadlocked.")
+            self.assertEqual(status, 0, msg="child process error")
+

 class BadStream(object):
    def write(self, data):

--- a/Misc/NEWS.d/next/Library/2018-09-13-10-09-19.bpo-6721.ZUL_F3.rst
+++ b/Misc/NEWS.d/next/Library/2018-09-13-10-09-19.bpo-6721.ZUL_F3.rst
+Acquire the logging module's commonly used internal locks while fork()ing to
+avoid deadlocks in the child process.