Commit 9a22eac9 authored by Jason Madden's avatar Jason Madden Committed by GitHub

Merge pull request #1718 from gevent/issue1698

Rework the way Semaphore deals with cross-thread usage
parents 00337094 c79a1978
Improve the ability to use monkey-patched locks, and
`gevent.lock.BoundedSemaphore`, across threads, especially when the
various threads might not have a gevent hub or any other active
greenlets. In particular, this handles some cases that previously
raised ``LoopExit`` or would hang. Note that this may not be reliable
on PyPy on Windows; such an environment is not currently recommended.
The semaphore tries to avoid creating a hub if it seems unnecessary,
automatically creating one in the single-threaded case when it would
block, but not in the multi-threaded case. While the differences
should be correctly detected, it's possible there are corner cases
where they might not be.
If your application appears to hang acquiring semaphores, but adding a
call to ``gevent.get_hub()`` in the thread attempting to acquire the
semaphore before doing so fixes it, please file an issue.
......@@ -45,9 +45,17 @@ else
echo "Compiling with -Ofast"
export CFLAGS="-Ofast $GEVENT_WARNFLAGS"
fi
# Needed for clock_gettime libc support on this version. This used to be spelled with LDFLAGS,
# but that is deprecated and produces a warning.
export LIBS="-lrt"
# -lrt: Needed for clock_gettime libc support on this version.
# -pthread: Needed for pthread_atfork (cffi).
# This used to be spelled with LDFLAGS, but that is deprecated and
# produces a warning on the 2014 image (?). Still needed on the
# 2010 image.
export LIBS="-lrt -pthread"
export LDFLAGS="$LIBS"
# Be sure that we get the loop we expect by default, and not
# a fallback loop.
export GEVENT_LOOP="libev-cext"
if [ -d /gevent -a -d /opt/python ]; then
# Running inside docker
......@@ -117,7 +125,10 @@ if [ -d /gevent -a -d /opt/python ]; then
python -c 'from __future__ import print_function; import gevent; print(gevent, gevent.__version__)'
python -c 'from __future__ import print_function; from gevent._compat import get_clock_info; print("clock info", get_clock_info("perf_counter"))'
python -c 'from __future__ import print_function; import greenlet; print(greenlet, greenlet.__version__)'
python -c 'from __future__ import print_function; import gevent.core; print("loop", gevent.core.loop)'
python -c 'from __future__ import print_function; import gevent.core; print("default loop", gevent.core.loop)'
# Other loops we should have
GEVENT_LOOP=libuv python -c 'from __future__ import print_function; import gevent.core; print("libuv loop", gevent.core.loop)'
GEVENT_LOOP=libev-cffi python -c 'from __future__ import print_function; import gevent.core; print("libev-cffi loop", gevent.core.loop)'
if [ -z "$GEVENTSETUP_DISABLE_ARES" ]; then
python -c 'from __future__ import print_function; import gevent.ares; print("ares", gevent.ares)'
fi
......
......@@ -10,10 +10,14 @@ from __future__ import print_function
import sys
from greenlet import error as greenlet_error
from gevent._compat import thread_mod_name
from gevent._hub_local import get_hub_noargs as get_hub
from gevent._hub_local import get_hub_if_exists
from gevent.exceptions import InvalidSwitchError
from gevent.exceptions import InvalidThreadUseError
from gevent.timeout import Timeout
locals()['getcurrent'] = __import__('greenlet').getcurrent
......@@ -23,6 +27,19 @@ __all__ = [
'AbstractLinkable',
]
# Need the real get_ident. We're imported early enough during monkey-patching
# that we can be sure nothing is monkey patched yet.
_get_thread_ident = __import__(thread_mod_name).get_ident
_allocate_thread_lock = __import__(thread_mod_name).allocate_lock
class _FakeNotifier(object):
__slots__ = (
'pending',
)
def __init__(self):
self.pending = False
class AbstractLinkable(object):
# Encapsulates the standard parts of the linking and notifying
# protocol common to both repeatable events (Event, Semaphore) and
......@@ -38,7 +55,9 @@ class AbstractLinkable(object):
# 2b. A subclass ensures that a Python-level native thread lock is held
# for the duration of the method; this is necessary in pure-Python mode.
# The only known implementation of such
# a subclass is for Semaphore.
# a subclass is for Semaphore. AND
# 3. The subclass that calls ``capture_hub`` catches
# and handles ``InvalidThreadUseError``
#
# TODO: As of gevent 1.5, we use the same datastructures and almost
# the same algorithm as Greenlet. See about unifying them more.
......@@ -85,9 +104,12 @@ class AbstractLinkable(object):
# we don't want to do get_hub() here to allow defining module-level objects
# without initializing the hub. However, for multiple-thread safety, as soon
# as a waiting method is entered, even if it won't have to wait, we
# need to grab the hub and assign ownership. For that reason, if the hub
# is present, we'll go ahead and take it.
self.hub = hub if hub is not None else get_hub_if_exists()
# need to grab the hub and assign ownership. But we don't want to grab one prematurely.
# The example is three threads, the main thread and two worker threads; if we create
# a Semaphore in the main thread but only use it in the two threads, if we had grabbed
# the main thread's hub, the two worker threads would have a dependency on it, meaning that
# if the main event loop is blocked, the worker threads might get blocked too.
self.hub = hub
def linkcount(self):
# For testing: how many objects are linked to this one?
......@@ -127,25 +149,74 @@ class AbstractLinkable(object):
# _notify_links method.
self._notifier.stop()
def _allocate_lock(self):
    # Return a new native thread lock (the un-monkey-patched
    # ``allocate_lock``; this module is imported before patching).
    # Indirection lets Cython subclasses reuse/override it.
    return _allocate_thread_lock()
def _getcurrent(self):
    # Return the currently running greenlet (indirection over the
    # module-level ``getcurrent`` injected via ``locals()``).
    return getcurrent() # pylint:disable=undefined-variable
def _get_thread_ident(self):
    # Return the native thread identifier, using the real (not
    # monkey-patched) ``get_ident`` captured at import time.
    return _get_thread_ident()
def _capture_hub(self, create):
    """
    Capture or validate the hub that owns this object.

    Subclasses should call this as the first action from any public
    method that could, in theory, block and switch to the hub.
    This may release the GIL.

    :param create: If true, create a hub for the current thread if
        one doesn't exist (``get_hub()``); otherwise only use an
        already-existing hub (``get_hub_if_exists()``).
    :return: ``self.hub``; may be ``None`` when *create* is false and
        no hub exists anywhere.
    :raises InvalidThreadUseError: if the captured ``self.hub`` belongs
        to a thread other than the calling one.
    """
    # First, detect a dead hub and drop it.
    while 1:
        my_hub = self.hub
        if my_hub is None:
            break

        if my_hub.dead: # dead is a property, could release GIL
            # back, holding GIL
            if self.hub is my_hub:
                self.hub = None
            my_hub = None
            break
        else:
            break

    if self.hub is None:
        # This next line might release the GIL.
        current_hub = get_hub() if create else get_hub_if_exists()
        if current_hub is None:
            return
        # We have the GIL again. Did anything change? If so,
        # we lost the race.
        if self.hub is None:
            self.hub = current_hub

    if self.hub is not None and self.hub.thread_ident != _get_thread_ident():
        raise InvalidThreadUseError(
            self.hub,
            get_hub_if_exists(),
            getcurrent() # pylint:disable=undefined-variable
        )
    return self.hub
def _check_and_notify(self):
    """
    If this object is ready and has waiters, begin the notification
    process.

    Schedules ``_notify_links`` as a callback on an appropriate hub
    when one exists; with no hub at all, runs the callbacks directly.
    """
    # If this object is ready to be notified, begin the process.
    if self.ready() and self._links and not self._notifier:
        hub = None
        try:
            # Don't force hub creation; we may legitimately have none.
            hub = self._capture_hub(False)
        except InvalidThreadUseError:
            # The current hub doesn't match self.hub. That's OK,
            # we still want to start the notifier in the thread running
            # self.hub (because the links probably contains greenlet.switch
            # calls valid only in that hub)
            pass
        if hub is not None:
            self._notifier = hub.loop.run_callback(self._notify_links, [])
        else:
            # Hmm, no hub. We must be the only thing running. Then it's OK
            # to just directly call the callbacks.
            self._notifier = 1
            try:
                self._notify_links([])
            finally:
                self._notifier = None
def _notify_link_list(self, links):
# The core of the _notify_links method to notify
......@@ -155,6 +226,8 @@ class AbstractLinkable(object):
only_while_ready = not self._notify_all
final_link = links[-1]
done = set() # of ids
hub = self.hub if self.hub is not None else get_hub_if_exists()
unswitched = []
while links: # remember this can be mutated
if only_while_ready and not self.ready():
break
......@@ -164,20 +237,31 @@ class AbstractLinkable(object):
if id_link not in done:
# XXX: JAM: What was I thinking? This doesn't make much sense,
# there's a good chance `link` will be deallocated, and its id() will
# be free to be reused.
# be free to be reused. This also makes looping difficult, you have to
# create new functions inside a loop rather than just once outside the loop.
done.add(id_link)
try:
self._drop_lock_for_switch_out()
try:
link(self)
except greenlet_error:
# couldn't switch to a greenlet, we must be
# running in a different thread. back on the list it goes for next time.
unswitched.append(link)
finally:
self._acquire_lock_for_switch_in()
except: # pylint:disable=bare-except
# We're running in the hub, errors must not escape.
self.hub.handle_error((link, self), *sys.exc_info())
if hub is not None:
hub.handle_error((link, self), *sys.exc_info())
else:
import traceback
traceback.print_exc()
if link is final_link:
break
return unswitched
def _notify_links(self, arrived_while_waiting):
# This method must hold the GIL, or be guarded with the lock that guards
......@@ -211,12 +295,13 @@ class AbstractLinkable(object):
# any more. In that case, we must not keep notifying anyone that's
# newly added after that, even if we go ready again.
try:
self._notify_link_list(self._links)
unswitched = self._notify_link_list(self._links)
# Now, those that arrived after we had begun the notification
# process. Follow the same rules, stop with those that are
# added so far to prevent starvation.
if arrived_while_waiting:
self._notify_link_list(arrived_while_waiting)
un2 = self._notify_link_list(arrived_while_waiting)
unswitched.extend(un2)
# Anything left needs to go back on the main list.
self._links.extend(arrived_while_waiting)
......@@ -230,13 +315,22 @@ class AbstractLinkable(object):
# free up thread affinity? In case of a pathological situation where
# one object was used from one thread once & first, but usually is
# used by another thread.
#
# BoundedSemaphore does this.
# Now we may be ready or not ready. If we're ready, which
# could have happened during the last link we called, then we
# must have more links than we started with. We need to schedule the
# wakeup.
self._check_and_notify()
def __unlink_all(self, obj):
# If we added unswitched greenlets, however, don't add them back to the links yet.
# We wouldn't be able to call them in this hub anyway.
# TODO: Instead of just adding these back to self._links, we should try to detect their
# "home" hub and move the callback to that hub. As it stands, there's a chance that
# if no greenlet tries to acquire/release this object in that hub, these objects
# will never get to run.
self._links.extend(unswitched)
def _quiet_unlink_all(self, obj):
if obj is None:
return
......@@ -248,75 +342,32 @@ class AbstractLinkable(object):
pass
def __wait_to_be_notified(self, rawlink): # pylint:disable=too-many-branches
# We've got to watch where we could potentially release the GIL.
# Decisions we make based an the state of this object must be in blocks
# that cannot release the GIL.
resume_this_greenlet = None
watcher = None
current_hub = get_hub()
send = None
while 1:
my_hub = self.hub
if my_hub is current_hub:
break
# We're owned by another hub.
if my_hub.dead: # dead is a property, this could have released the GIL.
# We have the GIL back. Did anything change?
if my_hub is not self.hub:
continue # start over.
# The other hub is dead, so we can take ownership.
self.hub = current_hub
break
# Some other hub owns this object. We must ask it to wake us
# up. We can't use a Python-level ``Lock`` because
# (1) it doesn't support a timeout on all platforms; and
# (2) we don't want to block this hub from running. So we need to
# do so in a way that cooperates with *two* hubs. That's what an
# async watcher is built for.
#
# Allocating and starting the watcher *could* release the GIL.
# with the libev cext, allocating won't, but starting briefly will.
# With other backends, allocating might, and starting might also.
# So...XXX: Race condition here, tiny though it may be.
watcher = current_hub.loop.async_()
send = watcher.send_ignoring_arg
if rawlink:
# Make direct calls to self.rawlink, the most common case,
# so cython can more easily optimize.
self.rawlink(send)
else:
self._notifier.args[0].append(send)
watcher.start(getcurrent().switch, self) # pylint:disable=undefined-variable
break
resume_this_greenlet = getcurrent().switch # pylint:disable=undefined-variable
if rawlink:
self.rawlink(resume_this_greenlet)
else:
self._notifier.args[0].append(resume_this_greenlet)
if self.hub is current_hub:
resume_this_greenlet = getcurrent().switch # pylint:disable=undefined-variable
if rawlink:
self.rawlink(resume_this_greenlet)
else:
self._notifier.args[0].append(resume_this_greenlet)
try:
self._drop_lock_for_switch_out()
result = current_hub.switch() # Probably releases
self._switch_to_hub(self.hub)
# If we got here, we were automatically unlinked already.
resume_this_greenlet = None
if result is not self: # pragma: no cover
raise InvalidSwitchError(
'Invalid switch into %s.wait(): %r' % (
self.__class__.__name__,
result,
)
)
finally:
self._quiet_unlink_all(resume_this_greenlet)
def _switch_to_hub(self, the_hub):
    """
    Switch to *the_hub* and wait to be switched back to.

    Drops the subclass's native lock (if any) around the switch,
    reacquires it on the way back in, and validates that we were
    woken by our own notification.

    :raises InvalidSwitchError: if the switch returned something
        other than this object.
    """
    self._drop_lock_for_switch_out()
    try:
        result = the_hub.switch()
    finally:
        self._acquire_lock_for_switch_in()

    if result is not self: # pragma: no cover
        raise InvalidSwitchError(
            'Invalid switch into %s.wait(): %r' % (
                self.__class__.__name__,
                result,
            )
        )
def _acquire_lock_for_switch_in(self):
    # No-op in the base class; presumably overridden by subclasses
    # (per the class comments, Semaphore guards its state with a
    # native lock around hub switches) — TODO confirm.
    return
......@@ -329,12 +380,12 @@ class AbstractLinkable(object):
The core of the wait implementation, handling switching and
linking.
This method is safe to call from multiple threads; it must be holding
the GIL for the entire duration, or be protected by a Python-level
lock for that to be true.
This method is NOT safe to call from multiple threads.
``self.hub`` must be initialized before entering this method.
The hub that is set is considered the owner and cannot be changed.
The hub that is set is considered the owner and cannot be changed
while this method is running. It must only be called from the thread
where ``self.hub`` is the current hub.
If *catch* is set to ``()``, a timeout that elapses will be
allowed to be raised.
......@@ -344,8 +395,11 @@ class AbstractLinkable(object):
resumed in this greenlet.
"""
with Timeout._start_new_or_dummy(timeout) as timer: # Might release
# We already checked above (_wait()) if we're ready()
try:
self.__wait_to_be_notified(True) # Use rawlink()
self.__wait_to_be_notified(
True,# Use rawlink()
)
return True
except catch as ex:
if ex is not timer:
......@@ -361,10 +415,6 @@ class AbstractLinkable(object):
return None # pragma: no cover all extent subclasses override
def _wait(self, timeout=None):
"""
This method is safe to call from multiple threads, providing
the conditions laid out in the class documentation are met.
"""
# Watch where we could potentially release the GIL.
self._capture_hub(True) # Must create, we must have an owner. Might release
......
......@@ -763,6 +763,7 @@ class AbstractLoop(object):
msg += ' fileno=' + repr(fileno)
#if sigfd is not None and sigfd != -1:
# msg += ' sigfd=' + repr(sigfd)
msg += ' callbacks=' + str(len(self._callbacks))
return msg
def fileno(self):
......
......@@ -5,7 +5,9 @@ from gevent._gevent_c_hub_local cimport get_hub_noargs as get_hub
from gevent._gevent_c_hub_local cimport get_hub_if_exists
cdef InvalidSwitchError
cdef InvalidThreadUseError
cdef Timeout
cdef _get_thread_ident
cdef bint _greenlet_imported
cdef extern from "greenlet/greenlet.h":
......@@ -30,6 +32,9 @@ cdef inline void greenlet_init():
cdef void _init()
cdef class _FakeNotifier(object):
cdef bint pending
cdef class AbstractLinkable(object):
# We declare the __weakref__ here in the base (even though
# that's not really what we want) as a workaround for a Cython
......@@ -49,12 +54,14 @@ cdef class AbstractLinkable(object):
cpdef unlink(self, callback)
cdef _check_and_notify(self)
cdef void _capture_hub(self, bint create)
cdef SwitchOutGreenletWithLoop _capture_hub(self, bint create)
cdef __wait_to_be_notified(self, bint rawlink)
cdef void __unlink_all(self, obj) # suppress exceptions
cdef void _quiet_unlink_all(self, obj) # suppress exceptions
cdef int _switch_to_hub(self, the_hub) except -1
@cython.nonecheck(False)
cdef _notify_link_list(self, list links)
cdef list _notify_link_list(self, list links)
@cython.nonecheck(False)
cpdef _notify_links(self, list arrived_while_waiting)
......@@ -63,5 +70,10 @@ cdef class AbstractLinkable(object):
cpdef _acquire_lock_for_switch_in(self)
cdef _wait_core(self, timeout, catch=*)
cdef _wait_return_value(self, waited, wait_success)
cdef _wait_return_value(self, bint waited, bint wait_success)
cdef _wait(self, timeout=*)
# Unreleated utilities
cdef _allocate_lock(self)
cdef greenlet _getcurrent(self)
cdef _get_thread_ident(self)
cimport cython
from gevent._gevent_c_greenlet_primitives cimport SwitchOutGreenletWithLoop
from gevent._gevent_c_abstract_linkable cimport AbstractLinkable
from gevent._gevent_c_hub_local cimport get_hub_if_exists
from gevent._gevent_c_hub_local cimport get_hub_noargs as get_hub
cdef InvalidThreadUseError
cdef LoopExit
cdef Timeout
cdef _native_sleep
cdef monotonic
cdef spawn_raw
cdef class _LockReleaseLink(object):
cdef object lock
cdef class Semaphore(AbstractLinkable):
cdef public int counter
cdef long _multithreaded
cpdef bint locked(self)
cpdef int release(self) except -1000
# We don't really want this to be public, but
# threadpool uses it
cpdef _start_notify(self)
cpdef int wait(self, object timeout=*) except -1000
@cython.locals(
success=bint,
e=Exception,
ex=Exception,
args=tuple,
)
cpdef bint acquire(self, bint blocking=*, object timeout=*) except -1000
cpdef __enter__(self)
cpdef __exit__(self, object t, object v, object tb)
@cython.locals(
hub_for_this_thread=SwitchOutGreenletWithLoop,
owning_hub=SwitchOutGreenletWithLoop,
)
cdef __acquire_from_other_thread(self, tuple args, bint blocking, timeout)
cpdef __acquire_from_other_thread_cb(self, list results, bint blocking, timeout, thread_lock)
cdef __add_link(self, link)
cdef __acquire_using_two_hubs(self,
SwitchOutGreenletWithLoop hub_for_this_thread,
current_greenlet,
timeout)
cdef __acquire_using_other_hub(self, SwitchOutGreenletWithLoop owning_hub, timeout)
cdef bint __acquire_without_hubs(self, timeout)
cdef bint __spin_on_native_lock(self, thread_lock, timeout)
cdef class BoundedSemaphore(Semaphore):
cdef readonly int _initial_value
......
......@@ -14,12 +14,33 @@ __all__ = [
'BoundedSemaphore',
]
from time import sleep as _native_sleep
from gevent._compat import monotonic
from gevent.exceptions import InvalidThreadUseError
from gevent.exceptions import LoopExit
from gevent.timeout import Timeout
def _get_linkable():
    # Resolve AbstractLinkable dynamically; __import__ returns the
    # top-level ``gevent`` package, so drill down to the submodule.
    x = __import__('gevent._abstract_linkable')
    return x._abstract_linkable.AbstractLinkable
# Assign through locals() rather than a normal import statement,
# presumably so Cython/static analysis doesn't treat AbstractLinkable
# as an ordinary compile-time name — TODO confirm intent.
locals()['AbstractLinkable'] = _get_linkable()
del _get_linkable
from gevent._hub_local import get_hub_if_exists
from gevent._hub_local import get_hub
from gevent.hub import spawn_raw
class _LockReleaseLink(object):
__slots__ = (
'lock',
)
def __init__(self, lock):
self.lock = lock
def __call__(self, _):
self.lock.release()
class Semaphore(AbstractLinkable): # pylint:disable=undefined-variable
"""
......@@ -42,7 +63,6 @@ class Semaphore(AbstractLinkable): # pylint:disable=undefined-variable
This Semaphore's ``__exit__`` method does not call the trace function
on CPython, but does under PyPy.
.. versionchanged:: 1.4.0
Document that the order in which waiters are awakened is not specified. It was not
specified previously, but due to CPython implementation quirks usually went in FIFO order.
......@@ -51,10 +71,17 @@ class Semaphore(AbstractLinkable): # pylint:disable=undefined-variable
.. versionchanged:: 1.5a3
The low-level ``rawlink`` method (most users won't use this) now automatically
unlinks waiters before calling them.
.. versionchanged:: NEXT
Improved support for multi-threaded usage. When multi-threaded usage is detected,
instances will no longer create the thread's hub if it's not present.
"""
__slots__ = (
'counter',
# Integer. Set to 0 initially. Set to the ident of the first
# thread that acquires us. If we later see a different thread
# ident, set to -1.
'_multithreaded',
)
def __init__(self, value=1, hub=None):
......@@ -63,10 +90,15 @@ class Semaphore(AbstractLinkable): # pylint:disable=undefined-variable
raise ValueError("semaphore initial value must be >= 0")
super(Semaphore, self).__init__(hub)
self._notify_all = False
self._multithreaded = 0
def __str__(self):
    # Include id() so distinct semaphores are distinguishable in
    # cross-thread debugging output. (The earlier short format without
    # the id was left as an unreachable duplicate; removed.)
    return '<%s at 0x%x counter=%s _links[%s]>' % (
        self.__class__.__name__,
        id(self),
        self.counter,
        self.linkcount()
    )
def locked(self):
"""
......@@ -162,28 +194,78 @@ class Semaphore(AbstractLinkable): # pylint:disable=undefined-variable
the semaphore was acquired, False will be returned. (Note that this can still
raise a ``Timeout`` exception, if some other caller had already started a timer.)
"""
if self.counter > 0:
# We conceptually now belong to the hub of
# the thread that called this, even though we didn't
# have to block. Note that we cannot force it to be created
# yet, because Semaphore is used by importlib.ModuleLock
# which is used when importing the hub itself!
# pylint:disable=too-many-return-statements,too-many-branches
# Sadly, the body of this method is rather complicated.
if self._multithreaded == 0:
self._multithreaded = self._get_thread_ident()
elif self._multithreaded != self._get_thread_ident():
self._multithreaded = -1
# We conceptually now belong to the hub of the thread that
# called this, whether or not we have to block. Note that we
# cannot force it to be created yet, because Semaphore is used
# by importlib.ModuleLock which is used when importing the hub
# itself! This also checks for cross-thread issues.
invalid_thread_use = None
try:
self._capture_hub(False)
except InvalidThreadUseError as e:
# My hub belongs to some other thread. We didn't release the GIL/object lock
# by raising the exception, so we know this is still true.
invalid_thread_use = e.args
e = None
if not self.counter and blocking:
# We would need to block. So coordinate with the main hub.
return self.__acquire_from_other_thread(invalid_thread_use, blocking, timeout)
if self.counter > 0:
self.counter -= 1
return True
if not blocking:
return False
success = self._wait(timeout)
if self._multithreaded != -1 and self.hub is None: # pylint:disable=access-member-before-definition
self.hub = get_hub() # pylint:disable=attribute-defined-outside-init
if self.hub is None and not invalid_thread_use:
# Someone else is holding us. There's not a hub here,
# nor is there a hub in that thread. We'll need to use regular locks.
# This will be unfair to yet a third thread that tries to use us with greenlets.
return self.__acquire_from_other_thread(
(None, None, self._getcurrent(), "NoHubs"),
blocking,
timeout
)
# self._wait may drop both the GIL and the _lock_lock.
# By the time we regain control, both have been reacquired.
try:
success = self._wait(timeout)
except LoopExit as ex:
args = ex.args
ex = None
if self.counter:
success = True
else:
# Avoid using ex.hub property to keep holding the GIL
if len(args) == 3 and args[1].main_hub:
# The main hub, meaning the main thread. We probably can do nothing with this.
raise
return self.__acquire_from_other_thread(
(self.hub, get_hub_if_exists(), self._getcurrent(), "LoopExit"),
blocking,
timeout)
if not success:
assert timeout is not None
# Our timer expired.
return False
# Neither our timer no another one expired, so we blocked until
# Neither our timer nor another one expired, so we blocked until
# awoke. Therefore, the counter is ours
assert self.counter > 0, (self.counter, blocking, timeout, success,)
self.counter -= 1
assert self.counter >= 0
return True
_py3k_acquire = acquire # PyPy needs this; it must be static for Cython
......@@ -194,6 +276,168 @@ class Semaphore(AbstractLinkable): # pylint:disable=undefined-variable
def __exit__(self, t, v, tb):
self.release()
def __add_link(self, link):
    # Register *link* to be called when we become ready. If a
    # notification pass is already scheduled, piggy-back on it by
    # appending to the list the notifier will process; otherwise use
    # the normal rawlink() path.
    notifier = self._notifier
    if notifier:
        notifier.args[0].append(link)
    else:
        self.rawlink(link)
def __acquire_from_other_thread(self, ex_args, blocking, timeout):
    """
    Block the current (non-owning) thread until the semaphore can be
    acquired, choosing a strategy based on which hubs exist.

    The first three elements of *ex_args* are
    ``(owning_hub, hub_for_this_thread, current_greenlet)``.
    """
    assert blocking
    # Some other hub owns this object, so it must wake us up. In
    # general a plain Python ``Lock`` won't do, because (1) it doesn't
    # support a timeout on all platforms and (2) we must not block this
    # thread's hub from running. Cooperating across *two* hubs is what
    # an async watcher is for — but that only works when both hubs
    # actually exist, so degrade to native locks otherwise. Whatever we
    # do has to drop the GIL and the object lock so the owning thread
    # can advance.
    owning_hub, hub_for_this_thread, current_greenlet = ex_args[:3]

    if hub_for_this_thread is None:
        if owning_hub is None:
            # No hubs on either side: plain native-lock handshake.
            return self.__acquire_without_hubs(timeout)
        # Probably a background worker thread. We don't want to create
        # a hub if it's not needed, and since none existed there are no
        # other greenlets here we could yield to anyway — so there's
        # nothing to block and a native lock is the simplest way to go.
        return self.__acquire_using_other_hub(owning_hub, timeout)

    # This thread has a hub we must not block: use an async watcher
    # and ask the next releaser of this object to wake us up.
    return self.__acquire_using_two_hubs(hub_for_this_thread,
                                         current_greenlet,
                                         timeout)
def __acquire_using_two_hubs(self,
                             hub_for_this_thread,
                             current_greenlet,
                             timeout):
    # Both this thread and the owning thread have hubs: park this
    # greenlet on an async watcher belonging to *our* hub and have the
    # owning hub's notification pass send() it.
    #
    # Allocating and starting the watcher *could* release the GIL.
    # With the libev cext, allocating won't, but starting briefly will.
    # With other backends, allocating might, and starting might also.
    # So...
    watcher = hub_for_this_thread.loop.async_()
    send = watcher.send_ignoring_arg
    watcher.start(current_greenlet.switch, self)
    try:
        with Timeout._start_new_or_dummy(timeout) as timer:
            # ... now that we're back holding the GIL, we need to verify our
            # state.
            try:
                while 1:
                    if self.counter > 0:
                        self.counter -= 1
                        assert self.counter >= 0, (self,)
                        return True

                    self.__add_link(send)

                    # Releases the object lock
                    self._switch_to_hub(hub_for_this_thread)
                    # We waited and got notified. We should be ready now, so a
                    # non-blocking acquire() should succeed. But sometimes we
                    # get spurious notifications? It's not entirely clear how.
                    # So loop until we get it, or until the timer expires.
                    result = self.acquire(0)
                    if result:
                        return result
            except Timeout as tex:
                # Only swallow *our own* timer; propagate others.
                if tex is not timer:
                    raise
                return False
    finally:
        self._quiet_unlink_all(send)
        watcher.stop()
        watcher.close()
def __acquire_from_other_thread_cb(self, results, blocking, timeout, thread_lock):
    """
    Runs as a greenlet in the owning hub's thread: perform the acquire
    there, publish the outcome into *results*, and wake the waiting
    thread by releasing *thread_lock*.
    """
    try:
        outcome = self.acquire(blocking, timeout)
        results.append(outcome)
    finally:
        # Always wake the waiter, even if acquire() raised.
        thread_lock.release()
    return outcome
def __acquire_using_other_hub(self, owning_hub, timeout):
    """
    Acquire by asking *owning_hub* (running in another thread) to do
    the blocking acquire on our behalf while we park on a native lock.
    """
    assert owning_hub is not get_hub_if_exists()
    wake_up = self._allocate_lock()
    wake_up.acquire()
    outcome = []

    # Schedule a greenlet in the owning thread that performs the
    # acquire and releases ``wake_up`` when it has finished.
    owning_hub.loop.run_callback(
        spawn_raw,
        self.__acquire_from_other_thread_cb,
        outcome,
        1, # blocking,
        timeout, # timeout,
        wake_up)

    # We MUST keep waiting until the callback has actually run: if we
    # timed out here just before it fired, we'd be out of sync. Hence
    # no timeout on this spin.
    self.__spin_on_native_lock(wake_up, None)
    return outcome[0]
def __acquire_without_hubs(self, timeout):
    # Neither this thread nor the owner has a hub: fall back to a pure
    # native-lock handshake, re-checking the counter each time we wake.
    thread_lock = self._allocate_lock()
    thread_lock.acquire()
    absolute_expiration = 0
    begin = 0
    if timeout:
        absolute_expiration = monotonic() + timeout

    # Cython won't compile a lambda here; use the tiny callable class.
    link = _LockReleaseLink(thread_lock)
    while 1:
        self.__add_link(link)
        if absolute_expiration:
            begin = monotonic()

        # Spin (dropping the object lock) until released or timed out.
        got_native = self.__spin_on_native_lock(thread_lock, timeout)
        self._quiet_unlink_all(link)
        if got_native:
            # Woken by a releaser. The counter *should* be ours, but
            # guard against spurious wakeups with a non-blocking try.
            if self.acquire(0):
                return True
        if absolute_expiration:
            now = monotonic()
            if now >= absolute_expiration:
                return False
            # Shrink the next iteration's timeout by the time spent.
            duration = now - begin
            timeout -= duration
            if timeout <= 0:
                return False
def __spin_on_native_lock(self, thread_lock, timeout):
    """
    Poll *thread_lock* until acquired (returns True) or *timeout*
    elapses (returns False). A falsy *timeout* means wait forever.
    """
    deadline = 0
    if timeout:
        deadline = monotonic() + timeout
    self._drop_lock_for_switch_out()
    try:
        # TODO: When timeout is given and the lock supports that
        # (Python 3), pass that.
        # Python 2 has terrible behaviour where lock acquires can't
        # be interrupted, so we use a spin loop.
        while 1:
            if thread_lock.acquire(0):
                return True
            if deadline and monotonic() >= deadline:
                return False
            _native_sleep(0.001)
    finally:
        self._acquire_lock_for_switch_in()
class BoundedSemaphore(Semaphore):
"""
......@@ -208,6 +452,10 @@ class BoundedSemaphore(Semaphore):
If not given, *value* defaults to 1.
"""
__slots__ = (
'_initial_value',
)
#: For monkey-patching, allow changing the class of error we raise
_OVER_RELEASE_ERROR = ValueError
......@@ -222,7 +470,13 @@ class BoundedSemaphore(Semaphore):
"""
if self.counter >= self._initial_value:
raise self._OVER_RELEASE_ERROR("Semaphore released too many times")
return Semaphore.release(self)
counter = Semaphore.release(self)
# When we are absolutely certain that no one holds this semaphore,
# release our hub and go back to floating. This assists in cross-thread
# uses.
if counter == self._initial_value:
self.hub = None # pylint:disable=attribute-defined-outside-init
return counter
def _at_fork_reinit(self):
super(BoundedSemaphore, self)._at_fork_reinit()
......
......@@ -36,12 +36,28 @@ class LoopExit(Exception):
"""
@property
def hub(self):
    """
    The (optional) hub that raised the error.

    .. versionadded:: NEXT
    """
    # XXX: Note that semaphore.py does this manually.
    args = self.args
    # A 3-tuple means we were raised by the hub: (message, hub, handles).
    if len(args) == 3:
        return args[1]
    # Otherwise, implicitly None.
def __repr__(self):
# pylint:disable=unsubscriptable-object
if len(self.args) == 3: # From the hub
import pprint
return "%s\n\tHub: %s\n\tHandles:\n%s" % (
self.args[0], self.args[1],
return (
"%s\n"
"\tHub: %s\n"
"\tHandles:\n%s"
) % (
self.args[0],
self.args[1],
pprint.pformat(self.args[2])
)
return Exception.__repr__(self)
......
......@@ -294,6 +294,10 @@ class loop(AbstractLoop):
libev.ev_timer_stop(self._timer0)
def _setup_for_run_callback(self):
# XXX: libuv needs to start the callback timer to be sure
# that the loop wakes up and calls this. Our C version doesn't
# do this.
# self._start_callback_timer()
self.ref() # we should go through the loop now
def destroy(self):
......
......@@ -8,6 +8,7 @@ infinite bounds (:class:`DummySemaphore`), along with a reentrant lock
(:class:`RLock`) with the same API as :class:`threading.RLock`.
"""
from __future__ import absolute_import
from __future__ import print_function
from gevent.hub import getcurrent
from gevent._compat import PURE_PYTHON
......@@ -43,82 +44,77 @@ _allocate_lock, _get_ident = monkey.get_original(
('allocate_lock', 'get_ident')
)
def atomic(meth):
    """
    Decorator: run *meth* while holding ``self._atomic``.

    Serializes entry so that decisions based on shared state are made
    under the instance's private mutex.
    """
    def wrapper(self, *args):
        self._atomic.acquire()
        try:
            return meth(self, *args)
        finally:
            self._atomic.release()
    return wrapper
class _GILLock(object):
    """
    A native recursive mutex that makes a group of operations behave as
    if the GIL were held for their duration.

    Built from two unpatched (native) locks: ``_atomic`` guards this
    object's own bookkeeping, while ``_gil`` is the lock callers
    actually contend on.
    """
    # Defect fixed: the scraped diff interleaved the removed
    # ``_OwnedLock`` implementation (``_owner``/``_block``/``_locking``
    # bookkeeping) with this replacement; only the new implementation
    # is kept.
    __slots__ = (
        '_owned_thread_id',
        '_gil',
        '_atomic',
        '_recursion_depth',
    )

    # Don't allow re-entry to these functions in a single thread, as
    # can happen if a sys.settrace is used. (XXX: What does that even
    # mean? Our original implementation that did that has been
    # replaced by something more robust)
    #
    # This is essentially a variant of the (pure-Python) RLock from the
    # standard library.

    def __init__(self):
        # Thread ident (from the unpatched get_ident) that owns the
        # lock, or None when unowned.
        self._owned_thread_id = None
        self._gil = _allocate_lock()
        self._atomic = _allocate_lock()
        # How many times the owning thread has acquired without
        # releasing (recursive acquisition count).
        self._recursion_depth = 0

    @atomic
    def acquire(self):
        """Acquire the lock, recursively if already owned by this thread.

        Always returns True (blocks until acquired).
        """
        current_tid = _get_ident()
        if self._owned_thread_id == current_tid:
            self._recursion_depth += 1
            return True

        # Not owned by this thread. Only one thread will make it
        # through this point. Drop our bookkeeping lock while blocking
        # on the GIL lock so other threads can release it.
        while 1:
            self._atomic.release()
            try:
                self._gil.acquire()
            finally:
                self._atomic.acquire()
            if self._owned_thread_id is None:
                break

        self._owned_thread_id = current_tid
        self._recursion_depth = 1
        return True

    @atomic
    def release(self):
        """Release one level of recursion; only the owner may call this.

        :raises RuntimeError: if the calling thread does not own the lock.
        """
        current_tid = _get_ident()
        if current_tid != self._owned_thread_id:
            raise RuntimeError("%s: Releasing lock not owned by you. You: 0x%x; Owner: 0x%x" % (
                self,
                current_tid, self._owned_thread_id or 0,
            ))

        self._recursion_depth -= 1

        if not self._recursion_depth:
            # Fully released: let a waiting thread take ownership.
            self._owned_thread_id = None
            self._gil.release()

    def __enter__(self):
        self.acquire()

    def __exit__(self, t, v, tb):
        self.release()

    def locked(self):
        # The underlying GIL lock is held exactly when some thread owns us.
        return self._gil.locked()
class _AtomicSemaphoreMixin(object):
# Behaves as though the GIL was held for the duration of acquire, wait,
......@@ -131,7 +127,7 @@ class _AtomicSemaphoreMixin(object):
# Note that this does *NOT*, in-and-of itself, make semaphores safe to use from multiple threads
__slots__ = ()
def __init__(self, *args, **kwargs):
    # Defect fixed: the scraped diff retained the removed
    # ``_OwnedLock()`` assignment alongside its replacement; keep only
    # the new ``_GILLock``. All state mutation is serialized through
    # this one native recursive lock.
    self._lock_lock = _GILLock() # pylint:disable=assigning-non-slot
    super(_AtomicSemaphoreMixin, self).__init__(*args, **kwargs)
def _acquire_lock_for_switch_in(self):
......@@ -347,7 +343,9 @@ class RLock(object):
release it.
"""
if self._owner is not getcurrent():
raise RuntimeError("cannot release un-acquired lock")
raise RuntimeError("cannot release un-acquired lock. Owner: %r Current: %r" % (
self._owner, getcurrent()
))
self._count = count = self._count - 1
if not count:
self._owner = None
......
......@@ -424,6 +424,8 @@ def _check_availability(name):
:raise ImportError: If the source or target cannot be imported.
:return: The tuple ``(gevent_module, target_module, target_module_name)``
"""
# Always import the gevent module first. This helps us be sure we can
# use regular imports in gevent files (when we can't use gevent.monkey.get_original())
gevent_module = getattr(__import__('gevent.' + name), name)
target_module_name = getattr(gevent_module, '__target__', name)
target_module = __import__(target_module_name)
......@@ -1216,10 +1218,13 @@ def patch_all(socket=True, dns=True, time=True, select=True, thread=True, os=Tru
# order is important
if os:
patch_os()
if time:
patch_time()
if thread:
patch_thread(Event=Event, _warnings=_warnings)
if time:
# time must be patched after thread, some modules used by thread
# need access to the real time.sleep function.
patch_time()
# sys must be patched after thread. in other cases threading._shutdown will be
# initiated to _MainThread with real thread ident
if sys:
......
......@@ -93,6 +93,7 @@ from .skipping import skipOnCI
from .skipping import skipOnPyPy3OnCI
from .skipping import skipOnPyPy
from .skipping import skipOnPyPyOnCI
from .skipping import skipOnPyPyOnWindows
from .skipping import skipOnPyPy3
from .skipping import skipIf
from .skipping import skipUnless
......
......@@ -22,31 +22,36 @@ from functools import wraps
def wrap_error_fatal(method):
    """Test decorator: make *every* hub error fatal for the duration of
    *method*, restoring the previous ``SYSTEM_ERROR`` afterwards.

    Operates on the hub *class* (not an instance) so it works even when
    no hub exists yet in the calling thread.
    """
    # Defect fixed: the scraped diff left the removed
    # ``gevent.get_hub()`` lines interleaved with the new
    # ``get_hub_class()`` lines; only the new version is kept.
    from gevent._hub_local import get_hub_class
    system_error = get_hub_class().SYSTEM_ERROR

    @wraps(method)
    def wrapper(self, *args, **kwargs):
        # XXX should also be able to do gevent.SYSTEM_ERROR = object
        # which is a global default to all hubs
        get_hub_class().SYSTEM_ERROR = object
        try:
            return method(self, *args, **kwargs)
        finally:
            get_hub_class().SYSTEM_ERROR = system_error
    return wrapper
def wrap_restore_handle_error(method):
    """Test decorator: after running *method*, drop any ``handle_error``
    override installed on the current hub instance, then re-raise any
    error the test recorded (via ``peek_error``) in the current greenlet.
    """
    # Defect fixed: the scraped diff left the removed
    # ``gevent.get_hub().handle_error`` save/restore lines interleaved
    # with the new code; only the new version is kept.
    from gevent._hub_local import get_hub_if_exists
    from gevent import getcurrent

    @wraps(method)
    def wrapper(self, *args, **kwargs):
        try:
            return method(self, *args, **kwargs)
        finally:
            # Remove any customized handle_error, if set on the
            # instance.
            try:
                del get_hub_if_exists().handle_error
            except AttributeError:
                # No hub, or no instance-level override: nothing to do.
                pass
            if self.peek_error()[0] is not None:
                getcurrent().throw(*self.peek_error()[1:])
    return wrapper
......@@ -24,5 +24,10 @@ class TestRLockMultiThread(test__semaphore.TestSemaphoreMultiThread):
# So we deliberately don't set the hub to help test that condition.
return lock.RLock()
def assertOneHasNoHub(self, sem):
    # An RLock keeps its underlying semaphore in ``_block``; assert the
    # underlying semaphore never bound itself to a hub.
    self.assertIsNone(sem._block.hub)
# Allow running this test module directly.
if __name__ == '__main__':
    greentest.main()
......@@ -12,6 +12,7 @@ import weakref
import gevent
import gevent.exceptions
from gevent.lock import Semaphore
from gevent.lock import BoundedSemaphore
import gevent.testing as greentest
from gevent.testing import timing
......@@ -74,12 +75,15 @@ class TestSemaphoreMultiThread(greentest.TestCase):
# See https://github.com/gevent/gevent/issues/1437
def _getTargetClass(self):
    # The synchronization class under test; subclasses override this to
    # reuse the same tests for other lock types.
    return Semaphore
def _makeOne(self):
    # Defect fixed: the scraped diff kept the removed hub-bound
    # construction (``Semaphore(1, gevent.get_hub())``) and its comment
    # next to the replacement; keep only the new version.
    #
    # Deliberately do NOT pass a hub: any hub association must happen
    # lazily, which is exactly what the cross-thread tests exercise.
    return self._getTargetClass()(1)
def _makeThreadMain(self, thread_running, thread_acquired, sem,
acquired, exc_info,
......@@ -104,7 +108,12 @@ class TestSemaphoreMultiThread(greentest.TestCase):
thread_acquired.set()
return thread_main
def _do_test_acquire_in_one_then_another(self, release=True, **thread_acquire_kwargs):
IDLE_ITERATIONS = 5
def _do_test_acquire_in_one_then_another(self,
release=True,
require_thread_acquired_to_finish=False,
**thread_acquire_kwargs):
from gevent import monkey
self.assertFalse(monkey.is_module_patched('threading'))
......@@ -124,6 +133,7 @@ class TestSemaphoreMultiThread(greentest.TestCase):
acquired, exc_info,
**thread_acquire_kwargs
))
t.daemon = True
t.start()
thread_running.wait(10) # implausibly large time
if release:
......@@ -136,14 +146,25 @@ class TestSemaphoreMultiThread(greentest.TestCase):
# that get run (including time-based) the notifier may or
# may not be immediately ready to run, so this can take up
# to two iterations.)
for _ in range(3):
for _ in range(self.IDLE_ITERATIONS):
gevent.idle()
if thread_acquired.wait(timing.LARGE_TICK):
break
self.assertEqual(acquired, [True])
if not release and thread_acquire_kwargs.get("timeout"):
# Spin the loop to be sure that the timeout has a chance to
# process. Interleave this with something that drops the GIL
# so the background thread has a chance to notice that.
for _ in range(self.IDLE_ITERATIONS):
gevent.idle()
if thread_acquired.wait(timing.LARGE_TICK):
break
thread_acquired.wait(timing.LARGE_TICK * 5)
if require_thread_acquired_to_finish:
self.assertTrue(thread_acquired.is_set())
try:
self.assertEqual(exc_info, [])
finally:
......@@ -157,6 +178,7 @@ class TestSemaphoreMultiThread(greentest.TestCase):
def test_acquire_in_one_then_another_timed(self):
sem, acquired_in_thread = self._do_test_acquire_in_one_then_another(
release=False,
require_thread_acquired_to_finish=True,
timeout=timing.SMALLEST_RELIABLE_DELAY)
self.assertEqual([False], acquired_in_thread)
# This doesn't, of course, notify anything, because
......@@ -177,7 +199,6 @@ class TestSemaphoreMultiThread(greentest.TestCase):
import threading
sem = self._makeOne()
# Make future acquires block
sem.acquire()
......@@ -193,8 +214,6 @@ class TestSemaphoreMultiThread(greentest.TestCase):
exc_info = []
acquired = []
glet = gevent.spawn(greenlet_one)
thread = threading.Thread(target=self._makeThreadMain(
threading.Event(), threading.Event(),
......@@ -202,19 +221,112 @@ class TestSemaphoreMultiThread(greentest.TestCase):
acquired, exc_info,
timeout=timing.LARGE_TICK
))
thread.daemon = True
gevent.idle()
sem.release()
glet.join()
thread.join(timing.LARGE_TICK)
for _ in range(3):
gevent.idle()
thread.join(timing.LARGE_TICK)
self.assertEqual(glet.value, True)
self.assertEqual([], exc_info)
self.assertEqual([False], acquired)
self.assertTrue(glet.dead, glet)
glet = None
def assertOneHasNoHub(self, sem):
    # A semaphore used only across native threads should never have
    # bound itself to a hub; include ``sem`` in the failure message.
    self.assertIsNone(sem.hub, sem)
@greentest.skipOnPyPyOnWindows("Flaky there; can't reproduce elsewhere")
def test_dueling_threads(self, acquire_args=(), create_hub=None):
    """Two native threads repeatedly acquire/release one semaphore.

    Neither thread has other greenlets to switch to, which previously
    could raise ``LoopExit`` or hang.  See
    https://github.com/gevent/gevent/issues/1698.

    :param acquire_args: Extra positional args for ``sem.acquire``
        (e.g. ``(True, 4)`` for a blocking acquire with timeout).
    :param create_hub: If true, each thread creates its own hub first;
        otherwise the test asserts no hub is ever created.
    """
    # pylint:disable=too-many-locals,too-many-statements
    # Threads doing nothing but acquiring and releasing locks, without
    # having any other greenlets to switch to.
    # https://github.com/gevent/gevent/issues/1698
    from gevent import monkey
    from gevent._hub_local import get_hub_if_exists

    # This test only makes sense with real, unpatched threads.
    self.assertFalse(monkey.is_module_patched('threading'))
    import threading
    from time import sleep as native_sleep

    sem = self._makeOne()
    self.assertOneHasNoHub(sem)
    count = 10000
    # Per-thread progress: last loop index completed, or an error string.
    results = [-1, -1]
    # Cooperative shutdown flag read by both worker threads.
    run = True

    def do_it(ix):
        # Worker body for thread *ix*: spin on acquire/release.
        if create_hub:
            gevent.get_hub()
        try:
            for i in range(count):
                if not run:
                    break
                sem.acquire(*acquire_args)
                sem.release()
                results[ix] = i
                if not create_hub:
                    # We don't artificially create the hub.
                    self.assertIsNone(
                        get_hub_if_exists(),
                        (get_hub_if_exists(), ix, i)
                    )
                # Periodically yield/sleep so the other thread gets a
                # chance to contend for the semaphore.
                if create_hub and i % 10 == 0:
                    gevent.sleep(timing.SMALLEST_RELIABLE_DELAY)
                elif i % 100 == 0:
                    native_sleep(timing.SMALLEST_RELIABLE_DELAY)
        except Exception as ex: # pylint:disable=broad-except
            import traceback; traceback.print_exc()
            # Record the failure so the main thread's final assertion
            # shows what went wrong.
            results[ix] = str(ex)
            ex = None
        finally:
            # Clean up any hub this thread (explicitly or implicitly)
            # created so it doesn't leak into other tests.
            hub = get_hub_if_exists()
            if hub is not None:
                hub.join()
                hub.destroy(destroy_loop=True)

    t1 = threading.Thread(target=do_it, args=(0,))
    t1.daemon = True
    t2 = threading.Thread(target=do_it, args=(1,))
    t2.daemon = True
    t1.start()
    t2.start()

    t1.join(1)
    t2.join(1)
    # Keep waiting as long as the threads are making progress; if a
    # whole wait interval passes with no change, assume a deadlock and
    # ask the workers to stop.
    while t1.is_alive() or t2.is_alive():
        cur = list(results)
        t1.join(7)
        t2.join(7)
        if cur == results:
            # Hmm, after two seconds, no progress
            run = False
            break

    # Both threads must have completed every iteration.
    self.assertEqual(results, [count - 1, count - 1])
def test_dueling_threads_timeout(self):
    # Same duel, but acquire blocking with a (generous) 4s timeout.
    self.test_dueling_threads((True, 4))
def test_dueling_threads_with_hub(self):
    # Same duel, but each worker thread creates its own hub first.
    self.test_dueling_threads(create_hub=True)
# XXX: Need a test with multiple greenlets in a non-primary
# thread. Things should work, just very slowly; instead of moving through
# greenlet.switch(), they'll be moving with async watchers.
class TestBoundedSemaphoreMultiThread(TestSemaphoreMultiThread):
    # Re-run every cross-thread semaphore test against BoundedSemaphore.

    def _getTargetClass(self):
        return BoundedSemaphore
@greentest.skipOnPurePython("Needs C extension")
class TestCExt(greentest.TestCase):
......
......@@ -60,6 +60,7 @@ from gevent._hub_local import get_hub_if_exists
from gevent.greenlet import Greenlet
from gevent.lock import BoundedSemaphore
from gevent.local import local as _local
from gevent.exceptions import LoopExit
if hasattr(__thread__, 'RLock'):
assert PY3 or PYPY
......@@ -115,7 +116,18 @@ class LockType(BoundedSemaphore):
if timeout > self._TIMEOUT_MAX:
raise OverflowError('timeout value is too large')
acquired = BoundedSemaphore.acquire(self, blocking, timeout)
try:
acquired = BoundedSemaphore.acquire(self, blocking, timeout)
except LoopExit:
# Raised when the semaphore was not trivially ours, and we needed
# to block. Some other thread presumably owns the semaphore, and there are no greenlets
# running in this thread to switch to. So the best we can do is
# release the GIL and try again later.
if blocking: # pragma: no cover
raise
acquired = False
if not acquired and not blocking and getcurrent() is not get_hub_if_exists():
# Run other callbacks. This makes spin locks works.
# We can't do this if we're in the hub, which we could easily be:
......
......@@ -162,7 +162,6 @@ def _format_thread_info(lines, thread_stacks, limit, current_thread_ident):
import threading
threads = {th.ident: th for th in threading.enumerate()}
lines.append('*' * 80)
lines.append('* Threads')
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment