Commit d8b42b3b authored by Jason Madden

Make socket.sendall() fast on PyPy3: 60MB/s -> 600MB/s

Share the chunking code between Python 2 and 3.

Also get the tests passing (read: skip a bunch) with 5.10.1 on OS X,
which is apparently less well tested than Linux.

We don't test pypy3 on appveyor, so:

[skip appveyor]
parent 460d78a7
......@@ -69,6 +69,9 @@
it will regenerate itself. The default loop is the only one that can
receive child events.
- Make :meth:`gevent.socket.socket.sendall` up to ten times faster on
PyPy3, through the same change that was applied in gevent 1.1b3 for PyPy2.
1.3a1 (2018-01-27)
......@@ -7,7 +7,6 @@ from __future__ import absolute_import
# Our import magic sadly makes this warning useless
# pylint: disable=undefined-variable
import time
from gevent import _socketcommon
from gevent._util import copy_globals
from gevent._compat import PYPY
......@@ -362,91 +361,13 @@ class socket(object):
return 0
def __send_chunk(self, data_memory, flags, timeleft, end):
    """
    Send the complete contents of ``data_memory`` before returning.

    This is the core loop around :meth:`send`.

    :param data_memory: The sliceable buffer to send; it is repeatedly
        re-sliced from the number of bytes already sent.
    :param flags: Passed through unchanged to :meth:`send`.
    :param timeleft: Either ``None`` if there is no timeout involved,
        or a float indicating the timeout to use.
    :param end: Either ``None`` if there is no timeout involved, or
        a float giving the absolute end time.
    :return: An updated value for ``timeleft`` (or None)
    :raises timeout: If ``timeleft`` was given and elapsed while
        sending this chunk.
    """
    # NOTE(review): ``timeout`` is not defined in this excerpt; it is
    # presumably the module-level socket timeout exception -- confirm.
    data_sent = 0
    len_data_memory = len(data_memory)
    started_timer = 0
    while data_sent < len_data_memory:
        chunk = data_memory[data_sent:]
        if timeleft is None:
            # No timeout configured: never touch ``end`` (it is None too).
            data_sent += self.send(chunk, flags)
        elif started_timer and timeleft <= 0:
            # Check before sending to guarantee a check
            # happens even if each chunk successfully sends its data
            # (especially important for SSL sockets since they have large
            # buffers). But only do this if we've actually tried to
            # send something once to avoid spurious timeouts on non-blocking
            # sockets.
            raise timeout('timed out')
        else:
            started_timer = 1
            data_sent += self.send(chunk, flags, timeout=timeleft)
            # Recompute the remaining budget from the absolute deadline.
            timeleft = end - time.time()
    return timeleft
def sendall(self, data, flags=0):
if isinstance(data, unicode):
data = data.encode()
# this sendall is also reused by gevent.ssl.SSLSocket subclass,
# so it should not call self._sock methods directly
data_memory = _get_memory(data)
len_data_memory = len(data_memory)
if not len_data_memory:
# Don't send empty data, can cause SSL EOFError.
# See issue 719
return 0
# On PyPy up through 2.6.0, subviews of a memoryview() object
# copy the underlying bytes the first time the builtin
# socket.send() method is called. On a non-blocking socket
# (that thus calls socket.send() many times) with a large
# input, this results in many repeated copies of an ever
# smaller string, depending on the networking buffering. For
# example, if each send() can process 1MB of a 50MB input, and
# we naively pass the entire remaining subview each time, we'd
# copy 49MB, 48MB, 47MB, etc, thus completely killing
# performance. To workaround this problem, we work in
# reasonable, fixed-size chunks. This results in a 10x
# improvement in sendall() throughput, while having no measurable impact on
# CPython (since it doesn't copy at all the only extra overhead is
# a few python function calls, which is negligible for large inputs).
# See
# Too small of a chunk (the socket's buf size is usually too
# small) results in reduced perf due to *too many* calls to send and too many
# small copies. With a buffer of 143K (the default on my system), for
# example, yields ~264MB/s, while using 1MB yields
# ~653MB/s (matching CPython). 1MB is arbitrary and might be better
# chosen, say, to match a page size?
chunk_size = max(self.getsockopt(SOL_SOCKET, SO_SNDBUF), 1024 * 1024) # pylint:disable=no-member
data_sent = 0
end = None
timeleft = None
if self.timeout is not None:
timeleft = self.timeout
end = time.time() + timeleft
while data_sent < len_data_memory:
chunk_end = min(data_sent + chunk_size, len_data_memory)
chunk = data_memory[data_sent:chunk_end]
timeleft = self.__send_chunk(chunk, flags, timeleft, end)
data_sent += len(chunk) # Guaranteed it sent the whole thing
return _socketcommon._sendall(self, data_memory, flags)
def sendto(self, *args):
sock = self._sock
......@@ -370,8 +370,9 @@ class socket(object):
if hasattr(_socket.socket, 'sendmsg'):
# Only on Unix
if hasattr(_socket.socket, 'recvmsg'):
# Only on Unix; PyPy 3.5 5.10.0 provides sendmsg and recvmsg, but not
# recvmsg_into (at least on os x)
def recvmsg(self, *args):
while True:
......@@ -382,6 +383,8 @@ class socket(object):
if hasattr(_socket.socket, 'recvmsg_into'):
def recvmsg_into(self, *args):
while True:
......@@ -441,27 +444,7 @@ class socket(object):
# PyPy2, so it's possibly premature to do this. However, there is a 3.5 test case that
# possibly exposes this in a severe way.
data_memory = _get_memory(data)
len_data_memory = len(data_memory)
if not len_data_memory:
# Don't try to send empty data at all, no point, and breaks ssl
# See issue 719
return 0
if self.timeout is None:
data_sent = 0
while data_sent < len_data_memory:
data_sent += self.send(data_memory[data_sent:], flags)
timeleft = self.timeout
end = time.time() + timeleft
data_sent = 0
while True:
data_sent += self.send(data_memory[data_sent:], flags, timeout=timeleft)
if data_sent >= len_data_memory:
timeleft = end - time.time()
if timeleft <= 0:
raise timeout('timed out')
return _socketcommon._sendall(self, data_memory, flags)
def sendto(self, *args):
......@@ -68,7 +68,7 @@ __py3_imports__ = [
import time
import sys
from gevent.hub import get_hub
from gevent.hub import ConcurrentObjectUseError
......@@ -349,3 +349,96 @@ def getfqdn(name=''):
name = hostname
return name
def __send_chunk(socket, data_memory, flags, timeleft, end, timeout=_timeout_error):
    """
    Send the complete contents of ``data_memory`` before returning.

    This is the core loop around :meth:`send`.

    :param socket: The object whose ``send`` method is called; it must
        accept ``send(chunk, flags)`` and ``send(chunk, flags, timeout=...)``.
    :param data_memory: The sliceable buffer to send; it is repeatedly
        re-sliced from the number of bytes already sent.
    :param flags: Passed through unchanged to ``socket.send``.
    :param timeleft: Either ``None`` if there is no timeout involved,
        or a float indicating the timeout to use.
    :param end: Either ``None`` if there is no timeout involved, or
        a float giving the absolute end time.
    :param timeout: The exception type raised when the time budget is
        exhausted; bound as a default argument.
    :return: An updated value for ``timeleft`` (or None)
    :raises timeout: If ``timeleft`` was given and elapsed while
        sending this chunk.
    """
    data_sent = 0
    len_data_memory = len(data_memory)
    started_timer = 0
    while data_sent < len_data_memory:
        chunk = data_memory[data_sent:]
        if timeleft is None:
            # No timeout configured: never touch ``end`` (it is None too).
            data_sent += socket.send(chunk, flags)
        elif started_timer and timeleft <= 0:
            # Check before sending to guarantee a check
            # happens even if each chunk successfully sends its data
            # (especially important for SSL sockets since they have large
            # buffers). But only do this if we've actually tried to
            # send something once to avoid spurious timeouts on non-blocking
            # sockets.
            raise timeout('timed out')
        else:
            started_timer = 1
            data_sent += socket.send(chunk, flags, timeout=timeleft)
            # Recompute the remaining budget from the absolute deadline.
            timeleft = end - time.time()
    return timeleft
def _sendall(socket, data_memory, flags,
             SOL_SOCKET=__socket__.SOL_SOCKET, # pylint:disable=no-member
             SO_SNDBUF=__socket__.SO_SNDBUF): # pylint:disable=no-member
    """
    Send the *data_memory* (which should be a memoryview)
    using the gevent *socket*, performing well on PyPy.

    :param socket: The gevent socket to send on. Its ``getsockopt``,
        ``timeout`` and (via ``__send_chunk``) ``send`` members are used.
    :param data_memory: The buffer to send, sliced into fixed-size chunks.
    :param flags: Passed through unchanged to the underlying ``send`` calls.
    :param SOL_SOCKET: Socket-level option constant, bound as a default
        argument.
    :param SO_SNDBUF: Send-buffer-size option constant, bound as a default
        argument.
    :return: ``0`` if *data_memory* is empty; otherwise ``None`` once
        everything has been sent.
    """
    # On PyPy up through 5.10.0, both PyPy2 and PyPy3, subviews
    # (slices) of a memoryview() object copy the underlying bytes the
    # first time the builtin socket.send() method is called. On a
    # non-blocking socket (that thus calls socket.send() many times)
    # with a large input, this results in many repeated copies of an
    # ever smaller string, depending on the networking buffering. For
    # example, if each send() can process 1MB of a 50MB input, and we
    # naively pass the entire remaining subview each time, we'd copy
    # 49MB, 48MB, 47MB, etc, thus completely killing performance. To
    # workaround this problem, we work in reasonable, fixed-size
    # chunks. This results in a 10x improvement in sendall() throughput,
    # while having no measurable impact on CPython (since it doesn't
    # copy at all the only extra overhead is a few python function
    # calls, which is negligible for large inputs).
    # On one macOS machine, PyPy3 5.10.1 produced ~ 67.53 MB/s before this change,
    # and ~ 616.01 MB/s after.
    # NOTE(review): a "See ..." reference followed here but its target
    # is missing from this copy; restore the upstream issue link.

    # Too small of a chunk (the socket's buf size is usually too
    # small) results in reduced perf due to *too many* calls to send and too many
    # small copies. A buffer of 143K (the default on my system), for
    # example, yields ~264MB/s, while using 1MB yields
    # ~653MB/s (matching CPython). 1MB is arbitrary and might be better
    # chosen, say, to match a page size?

    len_data_memory = len(data_memory)
    if not len_data_memory:
        # Don't try to send empty data at all, no point, and breaks ssl
        # See issue 719
        return 0

    # Never go below 1MB per chunk, even if the socket's send buffer is smaller.
    chunk_size = max(socket.getsockopt(SOL_SOCKET, SO_SNDBUF), 1024 * 1024)

    data_sent = 0
    end = None
    timeleft = None
    if socket.timeout is not None:
        # One deadline covers the whole sendall, not each chunk separately.
        timeleft = socket.timeout
        end = time.time() + timeleft

    while data_sent < len_data_memory:
        chunk_end = min(data_sent + chunk_size, len_data_memory)
        chunk = data_memory[data_sent:chunk_end]

        timeleft = __send_chunk(socket, chunk, flags, timeleft, end)
        data_sent += len(chunk) # Guaranteed it sent the whole thing
......@@ -630,7 +630,7 @@ if PYPY3:
if PYPY and sys.pypy_version_info[:4] in ( # pylint:disable=no-member
(5, 8, 0, 'beta'), (5, 9, 0, 'beta'),):
(5, 8, 0, 'beta'), (5, 9, 0, 'beta'), (5, 10, 1, 'final')):
# 3.5 is beta. Hard to say what are real bugs in us vs real bugs in pypy.
# For that reason, we pin these patches exactly to the version in use.
......@@ -653,6 +653,35 @@ if PYPY and sys.pypy_version_info[:4] in ( # pylint:disable=no-member
# The below are new with 5.10.1
# These fail with 'OSError: received malformed or improperly truncated ancillary data'
# Using the provided High Sierra binary, these fail with
# 'ValueError: invalid protocol version _SSLMethod.PROTOCOL_SSLv3'.
# gevent code isn't involved and running them unpatched has the same issue.
# This gets an EOF in violation of protocol; again, even without gevent
# This gets None instead of http1.1, even without gevent
# This fails to decode a filename even without gevent,
# at least on High Sierra.
disabled_tests += [
......@@ -516,7 +516,7 @@ class ThreadJoinOnShutdown(unittest.TestCase):
w = threading.Thread(target=worker)
import sys
if sys.version_info[:2] >= (3, 7) or (sys.version_info[:2] >= (3, 5) and hasattr(sys, 'pypy_version_info')):
if sys.version_info[:2] >= (3, 7) or (sys.version_info[:2] >= (3, 5) and hasattr(sys, 'pypy_version_info') and sys.platform != 'darwin'):
# In PyPy3 5.8.0, if we don't wait on this top-level "thread", 'w',
......@@ -526,6 +526,8 @@ class ThreadJoinOnShutdown(unittest.TestCase):
# the interpreter waiting on thread locks, like the issue described in
# for Python 3.4? in any case, it doesn't hang in Python 2.) This changed in
# 3.7a1 and waiting on it is again necessary and doesn't hang.
# PyPy3 5.10.1 is back to the "old" cpython behaviour, and waiting on it
# causes the whole process to hang, but apparently only on OS X---linux was fine without it
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment