Commit b2e6142c authored by Jason Madden's avatar Jason Madden

Fix an occasional crash in monkey patched 3.6/test_socket.py

First the crash report:

```
  testInterruptedRecvmsgTimeout (__main__.InterruptedRecvTimeoutTest) ... Traceback (most recent call last):
    File "/home/travis/build/gevent/gevent/src/gevent/_ffi/loop.py", line 83, in python_callback
      def python_callback(self, handle, revents):
    File "/home/travis/build/gevent/gevent/src/greentest/3.6/test_socket.py", line 3698, in <lambda>
      lambda signum, frame: 1 / 0)
  ZeroDivisionError: division by zero
  Fri Dec 15 16:09:36 2017
  ok
  testInterruptedSendTimeout (__main__.InterruptedSendTimeoutTest) ... Fatal Python error: ffi.from_handle() detected that the address passed points to garbage. If it is really the result of ffi.new_handle(), then the Python object has already been garbage collected

  Thread 0x00002b26d4c01700 (most recent call first):
    File "/home/travis/build/gevent/gevent/src/gevent/_threading.py", line 152 in wait
    File "/home/travis/build/gevent/gevent/src/gevent/_threading.py", line 436 in get
    File "/home/travis/build/gevent/gevent/src/gevent/threadpool.py", line 200 in _worker

  Thread 0x00002b26d4a00700 (most recent call first):
    File "/home/travis/build/gevent/gevent/src/gevent/_threading.py", line 152 in wait
    File "/home/travis/build/gevent/gevent/src/gevent/_threading.py", line 436 in get
    File "/home/travis/build/gevent/gevent/src/gevent/threadpool.py", line 200 in _worker

  Current thread 0x00002b26cc7b2a00 (most recent call first):
    File "/home/travis/build/gevent/gevent/src/gevent/_ffi/loop.py", line 178 in python_stop
    File "/home/travis/build/gevent/gevent/src/gevent/libuv/loop.py", line 289 in run
    File "/home/travis/build/gevent/gevent/src/gevent/hub.py", line 688 in run
```

The main problem turned out to be the way in which libuv reports
signals at random times, much worse than the way libev does. This led
to callback functions unexpectedly returning 0, because that's the
default for an unhandled exception or an onerror handler that returns
None. That in turn led to calls to python_stop that we weren't
expecting using a handle that had already been deallocated.

The fix is to both be defensive about what handles we try to use, and
to be more explicit about our error returns.
parent a0bda58b
......@@ -29,6 +29,12 @@
# useless-suppression: the only way to avoid repeating it for specific statements everywhere that we
# do Py2/Py3 stuff is to put it here. Sadly this means that we might get better but not realize it.
# duplicate-code: Yeah, the compatibility ssl modules are much the same
# In pylint 1.8.0, inconsistent-return-statements are created for the wrong reasons.
# This code raises it, even though there's only one return (the implicit 'return None' is presumably
# what triggers it):
# def foo():
# if baz:
# return 1
disable=wrong-import-position,
wrong-import-order,
missing-docstring,
......@@ -44,9 +50,10 @@ disable=wrong-import-position,
cyclic-import,
too-many-arguments,
redefined-builtin,
useless-suppression,
useless-suppression,
duplicate-code,
undefined-all-variable
undefined-all-variable,
inconsistent-return-statements
[FORMAT]
......
......@@ -2,7 +2,7 @@ setuptools
wheel
cython>=0.27.3
greenlet>=0.4.10
pylint>=1.7.1
pylint>=1.8.0
prospector[with_pyroma]
coverage>=4.0
coveralls>=1.0
......
......@@ -17,8 +17,17 @@ def _pid_dbg(*args, **kwargs):
print(os.getpid(), *args, **kwargs)
GEVENT_DEBUG = 0
CRITICAL = 1
ERROR = 3
DEBUG = 5
TRACE = 9
if os.getenv('GEVENT_DEBUG') == 'trace':
if os.getenv("GEVENT_DEBUG") == 'critical':
GEVENT_DEBUG = CRITICAL
elif os.getenv("GEVENT_DEBUG") == 'error':
GEVENT_DEBUG = ERROR
elif os.getenv('GEVENT_DEBUG') == 'debug':
GEVENT_DEBUG = DEBUG
elif os.getenv('GEVENT_DEBUG') == 'trace':
_dbg = _pid_dbg
GEVENT_DEBUG = TRACE
......@@ -11,6 +11,7 @@ from weakref import ref as WeakRef
from gevent._ffi import _dbg
from gevent._ffi import GEVENT_DEBUG
from gevent._ffi import TRACE
from gevent._ffi import CRITICAL
from gevent._ffi.callback import callback
__all__ = [
......@@ -62,23 +63,18 @@ EVENTS = GEVENT_CORE_EVENTS = _EVENTSType()
####
class _Callbacks(object):
if GEVENT_DEBUG >= TRACE:
def __init__(self, ffi):
self.ffi = ffi
self.callbacks = []
def from_handle(self, handle): # pylint:disable=method-hidden
_dbg("Getting from handle", handle)
traceback.print_stack()
x = self.ffi.from_handle(handle)
_dbg("Got from handle", handle, x)
return x
def __init__(self, ffi):
self.ffi = ffi
self.callbacks = []
if GEVENT_DEBUG < TRACE:
self.from_handle = ffi.from_handle
else:
def __init__(self, ffi):
self.ffi = ffi
self.callbacks = []
self.from_handle = self.ffi.from_handle
def from_handle(self, handle): # pylint:disable=method-hidden
_dbg("Getting from handle", handle)
x = self.ffi.from_handle(handle)
_dbg("Got from handle", handle, x)
return x
def python_callback(self, handle, revents):
"""
......@@ -88,21 +84,33 @@ class _Callbacks(object):
An exception occurred during the callback and you must call
:func:`_python_handle_error` to deal with it. The Python watcher
object will have the exception tuple saved in ``_exc_info``.
- 0
Everything went according to plan. You should check to see if the libev
watcher is still active, and call :func:`_python_stop` if so. This will
clean up the memory.
- 1
Everything went according to plan. You should check to see if the libev
watcher is still active, and call :func:`python_stop` if it is not. This will
clean up the memory. Finding the watcher still active at the event loop level,
but not having stopped itself at the gevent level is a buggy scenario and
shouldn't happen.
- 2
Everything went according to plan, but the watcher has already
been stopped. Its memory may no longer be valid.
This function should never return 0, as that's the default value that
Python exceptions will produce.
"""
orig_ffi_watcher = None
try:
# Even dereferencing the handle needs to be inside the try/except;
# if we don't return normally (e.g., a signal) then we wind up going
# to the 'onerror' handler, which
# to the 'onerror' handler (unhandled_onerror), which
# is not what we want; that can permanently wedge the loop depending
# on which callback was executing
# on which callback was executing.
# XXX: See comments in that function. We may be able to restart and do better?
if not handle:
# Hmm, a NULL handle. That's not supposed to happen.
# We can easily get into a loop if we deref it and allow that
# to raise.
_dbg("python_callback got null handle")
return 1
the_watcher = self.from_handle(handle)
orig_ffi_watcher = the_watcher._watcher
args = the_watcher.args
......@@ -114,8 +122,9 @@ class _Callbacks(object):
args = (revents, ) + args[1:]
the_watcher.callback(*args)
except: # pylint:disable=bare-except
_dbg("Got exception servicing watcher with handle", handle)
# It's possible for ``the_watcher`` to be undefined (UnboundLocalError)
# if we threw an exception on the line that created that variable.
# if we threw an exception (signal) on the line that created that variable.
# This is typically the case with a signal under libuv
try:
the_watcher
......@@ -132,11 +141,17 @@ class _Callbacks(object):
# It didn't stop itself, *and* it didn't stop itself, reset
# its watcher, and start itself again. libuv's io watchers MAY
# do that.
_dbg("The watcher has not stopped itself", the_watcher)
return 0
return 1 # It stopped itself
# The normal, expected scenario when we find the watcher still
# in the keepaliveset is that it is still active at the event loop
# level, so we don't expect that python_stop gets called.
_dbg("The watcher has not stopped itself, possibly still active", the_watcher)
return 1
return 2 # it stopped itself
def python_handle_error(self, handle, _revents):
_dbg("Handling error for handle", handle)
if not handle:
return
try:
watcher = self.from_handle(handle)
exc_info = watcher._exc_info
......@@ -167,14 +182,44 @@ class _Callbacks(object):
def unhandled_onerror(self, t, v, tb):
# This is supposed to be called for signals, etc.
# This is the onerror= value for CFFI.
# If we return None, C will get a value of 0/NULL;
# if we raise, CFFI will print the exception and then
# return 0/NULL; (unless error= was configured)
# If things go as planned, we return the value that asks
# C to call back and check on if the watcher needs to be closed or
# not.
# XXX: TODO: Could this cause events to be lost? Maybe we need to return
# a value that causes the C loop to try the callback again?
# at least for signals under libuv, which are delivered at very odd times.
# Hopefully the event still shows up when we poll the next time.
watcher = None
if tb is not None:
handle = tb.tb_frame.f_locals['handle']
watcher = self.from_handle(handle)
if handle: # handle could be NULL
watcher = self.from_handle(handle)
if watcher is not None:
watcher.loop._check_callback_handle_error(t, v, tb)
return 1
else:
raise v
def python_stop(self, handle):
if not handle:
print(
"WARNING: gevent: Unable to dereference handle; not stopping watcher. "
"Native resources may leak. This is most likely a bug in gevent.",
file=sys.stderr)
# The alternative is to crash with no helpful information
# NOTE: Raising exceptions here does nothing, they're swallowed by CFFI.
# Since the C level passed in a null pointer, even dereferencing the handle
# will just produce some exceptions.
if GEVENT_DEBUG < CRITICAL:
return
import pdb; pdb.set_trace()
_dbg("python_stop: stopping watcher with handle", handle)
watcher = self.from_handle(handle)
watcher.stop()
......@@ -512,7 +557,7 @@ class AbstractLoop(object):
def fork(self, ref=True, priority=None):
return self._watchers.fork(self, ref, priority)
def async(self, ref=True, priority=None):
def async(self, ref=True, priority=None): # XXX: Yeah, we know. pylint:disable=assign-to-new-keyword
return self._watchers.async(self, ref, priority)
if sys.platform != "win32":
......
......@@ -30,7 +30,7 @@ static void _gevent_generic_callback(struct ev_loop* loop,
// and allowing memory to be freed
python_handle_error(handle, revents);
break;
case 0:
case 1:
// Code to stop the event. Note that if python_callback
// has disposed of the last reference to the handle,
// `watcher` could now be invalid/disposed memory!
......@@ -38,8 +38,16 @@ static void _gevent_generic_callback(struct ev_loop* loop,
python_stop(handle);
}
break;
default:
assert(cb_result == 1);
case 2:
// watcher is already stopped and dead, nothing to do.
break;
default:
fprintf(stderr,
"WARNING: gevent: Unexpected return value %d from Python callback "
"for watcher %p and handle %d\n",
cb_result,
watcher, handle);
// XXX: Possible leaking of resources here? Should we be
// closing the watcher?
}
}
......@@ -11,7 +11,9 @@ static void (*gevent_noop)(void* handle) = &_gevent_noop;
static void _gevent_generic_callback1(void* vwatcher, int arg)
{
const uv_handle_t* watcher = (uv_handle_t*)vwatcher;
const void* handle = watcher->data; // Python code may set this to NULL.
// Python code may set this to NULL or even change it
// out from under us, which would tend to break things.
void* handle = watcher->data;
const int cb_result = python_callback(handle, arg);
switch(cb_result) {
case -1:
......@@ -20,62 +22,89 @@ static void _gevent_generic_callback1(void* vwatcher, int arg)
// and allowing memory to be freed
python_handle_error(handle, arg);
break;
case 0:
// Code to stop the event. Note that if python_callback
case 1:
// Code to stop the event IF NEEDED. Note that if python_callback
// has disposed of the last reference to the handle,
// `watcher` could now be invalid/disposed memory!
if (!uv_is_active(watcher)) {
python_stop(handle);
if (watcher->data != handle) {
if (watcher->data) {
// If Python set the data to NULL, then they
// expected to be stopped. That's fine.
// Otherwise, something weird happened.
fprintf(stderr,
"WARNING: gevent: watcher handle changed in callback "
"from %p to %p for watcher at %p of type %d\n",
handle, watcher->data, watcher, watcher->type);
// There's a very good chance that the object the
// handle referred to has been changed and/or the
// old handle has been deallocated (most common), so
// passing the old handle will crash. Instead we
// pass a sigil to let python distinguish this case.
python_stop(NULL);
}
}
else {
python_stop(handle);
}
}
break;
default:
assert(cb_result == 1);
case 2:
// watcher is already stopped and dead, nothing to do.
break;
default:
fprintf(stderr,
"WARNING: gevent: Unexpected return value %d from Python callback "
"for watcher %p (of type %d) and handle %p\n",
cb_result,
watcher, watcher->type, handle);
// XXX: Possible leaking of resources here? Should we be
// closing the watcher?
}
}
static void _gevent_generic_callback0(void* handle)
{
_gevent_generic_callback1(handle, 0);
_gevent_generic_callback1(handle, 0);
}
static void _gevent_poll_callback2(void* handle, int status, int events)
{
_gevent_generic_callback1(handle, status < 0 ? status : events);
_gevent_generic_callback1(handle, status < 0 ? status : events);
}
static void _gevent_fs_event_callback3(void* handle, const char* filename, int events, int status)
{
_gevent_generic_callback1(handle, status < 0 ? status : events);
_gevent_generic_callback1(handle, status < 0 ? status : events);
}
typedef struct _gevent_fs_poll_s {
uv_fs_poll_t handle;
uv_stat_t curr;
uv_stat_t prev;
uv_fs_poll_t handle;
uv_stat_t curr;
uv_stat_t prev;
} gevent_fs_poll_t;
static void _gevent_fs_poll_callback3(void* handlep, int status, const uv_stat_t* prev, const uv_stat_t* curr)
{
// stat pointers are valid for this callback only.
// if given, copy them into our structure, where they can be reached
// from python, just like libev's watcher does, before calling
// the callback.
// stat pointers are valid for this callback only.
// if given, copy them into our structure, where they can be reached
// from python, just like libev's watcher does, before calling
// the callback.
// The callback is invoked with status < 0 if path does not exist
// or is inaccessible. The watcher is not stopped but your
// callback is not called again until something changes (e.g. when
// the file is created or the error reason changes).
// In that case the fields will be 0 in curr/prev.
// The callback is invoked with status < 0 if path does not exist
// or is inaccessible. The watcher is not stopped but your
// callback is not called again until something changes (e.g. when
// the file is created or the error reason changes).
// In that case the fields will be 0 in curr/prev.
gevent_fs_poll_t* handle = (gevent_fs_poll_t*)handlep;
assert(status == 0);
gevent_fs_poll_t* handle = (gevent_fs_poll_t*)handlep;
assert(status == 0);
handle->curr = *curr;
handle->prev = *prev;
handle->curr = *curr;
handle->prev = *prev;
_gevent_generic_callback1((uv_handle_t*)handle, 0);
_gevent_generic_callback1((uv_handle_t*)handle, 0);
}
......@@ -372,7 +372,7 @@ class io(_base.IoMixin, watcher):
if w is not None and w.callback is not None:
# There's still a reference to it, and it's started,
# so we can't stop.
continue
return
# If we get here, nothing was started
# so we can take ourself out of the polling set
self.stop()
......@@ -523,7 +523,7 @@ class child(_SimulatedWithAsyncMixin,
self._async.send()
class async(_base.AsyncMixin, watcher):
class async(_base.AsyncMixin, watcher): # XXX: Yeah, we know pylint:disable=assign-to-new-keyword
def _watcher_ffi_init(self, args):
# It's dangerous to have a raw, non-initted struct
......
......@@ -307,6 +307,9 @@ def main():
if 'PYTHONFAULTHANDLER' not in os.environ:
os.environ['PYTHONFAULTHANDLER'] = 'true'
if 'GEVENT_DEBUG' not in os.environ:
os.environ['GEVENT_DEBUG'] = 'error'
tests = discover(options.tests, options.ignore, coverage)
if options.discover:
for cmd, options in tests:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment