Commit d8b42b3b authored by Jason Madden

Make socket.sendall() fast on PyPy3: 60MB/s -> 600MB/s

Share the chunking code between Python 2 and 3.

Also get the tests passing (read: skip a bunch) with 5.10.1 on OS X,
which is apparently less well tested than Linux.

We don't test pypy3 on appveyor, so:

[skip appveyor]
parent 460d78a7
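
For context on the "60MB/s -> 600MB/s" figure: the numbers come from timing a large sendall() over a loopback connection, in the spirit of the bench_sendall.py script referenced in the comments below. What follows is only a minimal sketch of such a measurement; the socket setup, payload size, and names are assumptions, not the actual benchmark shipped with gevent.

# Hypothetical micro-benchmark sketch; gevent's real bench_sendall.py may differ.
import time
import gevent
from gevent import socket

PAYLOAD = b'x' * (50 * 1024 * 1024)  # 50MB, large enough to exercise the chunking path

def drain(conn):
    # Read until the sender closes its end of the connection.
    while conn.recv(1024 * 1024):
        pass

listener = socket.socket()
listener.bind(('127.0.0.1', 0))
listener.listen(1)

client = socket.socket()
client.connect(listener.getsockname())
server, _ = listener.accept()

reader = gevent.spawn(drain, server)
start = time.time()
client.sendall(PAYLOAD)  # the code path this commit speeds up on PyPy3
client.close()
reader.join()
elapsed = time.time() - start
print('%.1f MB/s' % (len(PAYLOAD) / elapsed / (1024.0 * 1024.0)))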
@@ -69,6 +69,9 @@
it will regenerate itself. The default loop is the only one that can
receive child events.
+- Make :meth:`gevent.socket.socket.sendall` up to ten times faster on
+  PyPy3, through the same change that was applied in gevent 1.1b3 for PyPy2.
1.3a1 (2018-01-27)
==================
...
@@ -7,7 +7,6 @@ from __future__ import absolute_import
# Our import magic sadly makes this warning useless
# pylint: disable=undefined-variable
-import time
from gevent import _socketcommon
from gevent._util import copy_globals
from gevent._compat import PYPY
@@ -362,91 +361,13 @@ class socket(object):
return 0
raise
-def __send_chunk(self, data_memory, flags, timeleft, end):
-"""
-Send the complete contents of ``data_memory`` before returning.
-This is the core loop around :meth:`send`.
-:param timeleft: Either ``None`` if there is no timeout involved,
-or a float indicating the timeout to use.
-:param end: Either ``None`` if there is no timeout involved, or
-a float giving the absolute end time.
-:return: An updated value for ``timeleft`` (or None)
-:raises timeout: If ``timeleft`` was given and elapsed while
-sending this chunk.
-"""
-data_sent = 0
-len_data_memory = len(data_memory)
-started_timer = 0
-while data_sent < len_data_memory:
-chunk = data_memory[data_sent:]
-if timeleft is None:
-data_sent += self.send(chunk, flags)
-elif started_timer and timeleft <= 0:
-# Check before sending to guarantee a check
-# happens even if each chunk successfully sends its data
-# (especially important for SSL sockets since they have large
-# buffers). But only do this if we've actually tried to
-# send something once to avoid spurious timeouts on non-blocking
-# sockets.
-raise timeout('timed out')
-else:
-started_timer = 1
-data_sent += self.send(chunk, flags, timeout=timeleft)
-timeleft = end - time.time()
-return timeleft
def sendall(self, data, flags=0):
if isinstance(data, unicode):
data = data.encode()
# this sendall is also reused by gevent.ssl.SSLSocket subclass,
# so it should not call self._sock methods directly
data_memory = _get_memory(data)
-len_data_memory = len(data_memory)
-if not len_data_memory:
-# Don't send empty data, can cause SSL EOFError.
-# See issue 719
-return 0
-# On PyPy up through 2.6.0, subviews of a memoryview() object
-# copy the underlying bytes the first time the builtin
-# socket.send() method is called. On a non-blocking socket
-# (that thus calls socket.send() many times) with a large
-# input, this results in many repeated copies of an ever
-# smaller string, depending on the networking buffering. For
-# example, if each send() can process 1MB of a 50MB input, and
-# we naively pass the entire remaining subview each time, we'd
-# copy 49MB, 48MB, 47MB, etc, thus completely killing
-# performance. To workaround this problem, we work in
-# reasonable, fixed-size chunks. This results in a 10x
-# improvement to bench_sendall.py, while having no measurable impact on
-# CPython (since it doesn't copy at all the only extra overhead is
-# a few python function calls, which is negligible for large inputs).
-# See https://bitbucket.org/pypy/pypy/issues/2091/non-blocking-socketsend-slow-gevent
-# Too small of a chunk (the socket's buf size is usually too
-# small) results in reduced perf due to *too many* calls to send and too many
-# small copies. With a buffer of 143K (the default on my system), for
-# example, bench_sendall.py yields ~264MB/s, while using 1MB yields
-# ~653MB/s (matching CPython). 1MB is arbitrary and might be better
-# chosen, say, to match a page size?
-chunk_size = max(self.getsockopt(SOL_SOCKET, SO_SNDBUF), 1024 * 1024) # pylint:disable=no-member
-data_sent = 0
-end = None
-timeleft = None
-if self.timeout is not None:
-timeleft = self.timeout
-end = time.time() + timeleft
-while data_sent < len_data_memory:
-chunk_end = min(data_sent + chunk_size, len_data_memory)
-chunk = data_memory[data_sent:chunk_end]
-timeleft = self.__send_chunk(chunk, flags, timeleft, end)
-data_sent += len(chunk) # Guaranteed it sent the whole thing
+return _socketcommon._sendall(self, data_memory, flags)
def sendto(self, *args):
sock = self._sock
...
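
An editor's note on the timeout handling documented by the removed __send_chunk above (and preserved by the shared helper added further down): sendall converts the socket's relative timeout into an absolute deadline once, then recomputes the remaining budget after every partial send, and only raises a timeout once at least one send has been attempted. The following is a standalone sketch of that bookkeeping with hypothetical names; a plain OSError stands in for socket.timeout and it is not gevent code.

import time

def send_all_with_deadline(send, chunks, timeout):
    # Hypothetical illustration of the timeleft/end bookkeeping; not gevent code.
    end = None if timeout is None else time.time() + timeout
    timeleft = timeout
    attempted = False
    for chunk in chunks:
        if end is not None and attempted and timeleft <= 0:
            # Only time out after at least one send attempt, mirroring the
            # "started_timer" guard in __send_chunk.
            raise OSError('timed out')
        send(chunk, timeleft)  # a send that honors a per-call timeout
        attempted = True
        if end is not None:
            timeleft = end - time.time()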
@@ -370,8 +370,9 @@ class socket(object):
raise
self._wait(self._read_event)
-if hasattr(_socket.socket, 'sendmsg'):
-# Only on Unix
+if hasattr(_socket.socket, 'recvmsg'):
+# Only on Unix; PyPy 3.5 5.10.0 provides sendmsg and recvmsg, but not
+# recvmsg_into (at least on os x)
def recvmsg(self, *args):
while True:
@@ -382,6 +383,8 @@ class socket(object):
raise
self._wait(self._read_event)
+if hasattr(_socket.socket, 'recvmsg_into'):
def recvmsg_into(self, *args):
while True:
try:
@@ -441,27 +444,7 @@ class socket(object):
# PyPy2, so it's possibly premature to do this. However, there is a 3.5 test case that
# possibly exposes this in a severe way.
data_memory = _get_memory(data)
-len_data_memory = len(data_memory)
-if not len_data_memory:
-# Don't try to send empty data at all, no point, and breaks ssl
-# See issue 719
-return 0
-if self.timeout is None:
-data_sent = 0
-while data_sent < len_data_memory:
-data_sent += self.send(data_memory[data_sent:], flags)
-else:
-timeleft = self.timeout
-end = time.time() + timeleft
-data_sent = 0
-while True:
-data_sent += self.send(data_memory[data_sent:], flags, timeout=timeleft)
-if data_sent >= len_data_memory:
-break
-timeleft = end - time.time()
-if timeleft <= 0:
-raise timeout('timed out')
+return _socketcommon._sendall(self, data_memory, flags)
def sendto(self, *args):
try:
...
@@ -68,7 +68,7 @@ __py3_imports__ = [
__imports__.extend(__py3_imports__)
+import time
import sys
from gevent.hub import get_hub
from gevent.hub import ConcurrentObjectUseError
@@ -349,3 +349,96 @@ def getfqdn(name=''):
else:
name = hostname
return name
+def __send_chunk(socket, data_memory, flags, timeleft, end, timeout=_timeout_error):
+"""
+Send the complete contents of ``data_memory`` before returning.
+This is the core loop around :meth:`send`.
+:param timeleft: Either ``None`` if there is no timeout involved,
+or a float indicating the timeout to use.
+:param end: Either ``None`` if there is no timeout involved, or
+a float giving the absolute end time.
+:return: An updated value for ``timeleft`` (or None)
+:raises timeout: If ``timeleft`` was given and elapsed while
+sending this chunk.
+"""
+data_sent = 0
+len_data_memory = len(data_memory)
+started_timer = 0
+while data_sent < len_data_memory:
+chunk = data_memory[data_sent:]
+if timeleft is None:
+data_sent += socket.send(chunk, flags)
+elif started_timer and timeleft <= 0:
+# Check before sending to guarantee a check
+# happens even if each chunk successfully sends its data
+# (especially important for SSL sockets since they have large
+# buffers). But only do this if we've actually tried to
+# send something once to avoid spurious timeouts on non-blocking
+# sockets.
+raise timeout('timed out')
+else:
+started_timer = 1
+data_sent += socket.send(chunk, flags, timeout=timeleft)
+timeleft = end - time.time()
+return timeleft
+def _sendall(socket, data_memory, flags,
+SOL_SOCKET=__socket__.SOL_SOCKET, # pylint:disable=no-member
+SO_SNDBUF=__socket__.SO_SNDBUF): # pylint:disable=no-member
+"""
+Send the *data_memory* (which should be a memoryview)
+using the gevent *socket*, performing well on PyPy.
+"""
+# On PyPy up through 5.10.0, both PyPy2 and PyPy3, subviews
+# (slices) of a memoryview() object copy the underlying bytes the
+# first time the builtin socket.send() method is called. On a
+# non-blocking socket (that thus calls socket.send() many times)
+# with a large input, this results in many repeated copies of an
+# ever smaller string, depending on the networking buffering. For
+# example, if each send() can process 1MB of a 50MB input, and we
+# naively pass the entire remaining subview each time, we'd copy
+# 49MB, 48MB, 47MB, etc, thus completely killing performance. To
+# work around this problem, we work in reasonable, fixed-size
+# chunks. This results in a 10x improvement to bench_sendall.py,
+# while having no measurable impact on CPython (since it doesn't
+# copy at all; the only extra overhead is a few python function
+# calls, which is negligible for large inputs).
+# On one macOS machine, PyPy3 5.10.1 produced ~ 67.53 MB/s before this change,
+# and ~ 616.01 MB/s after.
+# See https://bitbucket.org/pypy/pypy/issues/2091/non-blocking-socketsend-slow-gevent
+# Too small of a chunk (the socket's buf size is usually too
+# small) results in reduced perf due to *too many* calls to send and too many
+# small copies. With a buffer of 143K (the default on my system), for
+# example, bench_sendall.py yields ~264MB/s, while using 1MB yields
+# ~653MB/s (matching CPython). 1MB is arbitrary and might be better
+# chosen, say, to match a page size?
+len_data_memory = len(data_memory)
+if not len_data_memory:
+# Don't try to send empty data at all, no point, and breaks ssl
+# See issue 719
+return 0
+chunk_size = max(socket.getsockopt(SOL_SOCKET, SO_SNDBUF), 1024 * 1024)
+data_sent = 0
+end = None
+timeleft = None
+if socket.timeout is not None:
+timeleft = socket.timeout
+end = time.time() + timeleft
+while data_sent < len_data_memory:
+chunk_end = min(data_sent + chunk_size, len_data_memory)
+chunk = data_memory[data_sent:chunk_end]
+timeleft = __send_chunk(socket, chunk, flags, timeleft, end)
+data_sent += len(chunk) # Guaranteed it sent the whole thing
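
To make the comment above concrete, here is the chunking pattern that _sendall implements, distilled to a standalone function. The name chunked_send and the injectable send callable are hypothetical, and the timeout bookkeeping is omitted; this is a sketch, not part of gevent's API.

def chunked_send(send, data, chunk_size=1024 * 1024):
    # Illustrative sketch only; not gevent code.
    view = memoryview(data)
    total = len(view)
    sent = 0
    while sent < total:
        # Bound each slice to chunk_size so that, on PyPy, any copy made for
        # the underlying send() is at most chunk_size bytes rather than the
        # whole remaining buffer.
        chunk = view[sent:sent + chunk_size]
        done = 0
        while done < len(chunk):
            # Like socket.send(), the callable may accept only part of the data.
            done += send(chunk[done:])
        sent += len(chunk)
    return sent

Wired to a real socket's send, each outer iteration hands the kernel at most 1MB (or the send buffer size, if that is larger), which is what keeps PyPy's per-send copy cost bounded.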
@@ -630,7 +630,7 @@ if PYPY3:
if PYPY and sys.pypy_version_info[:4] in ( # pylint:disable=no-member
-(5, 8, 0, 'beta'), (5, 9, 0, 'beta'),):
+(5, 8, 0, 'beta'), (5, 9, 0, 'beta'), (5, 10, 1, 'final')):
# 3.5 is beta. Hard to say what are real bugs in us vs real bugs in pypy.
# For that reason, we pin these patches exactly to the version in use.
@@ -653,6 +653,35 @@ if PYPY and sys.pypy_version_info[:4] in ( # pylint:disable=no-member
'test_subprocess.POSIXProcessTestCase.test_pass_fds',
'test_subprocess.POSIXProcessTestCase.test_pass_fds_inheritable',
'test_subprocess.POSIXProcessTestCase.test_pipe_cloexec',
+# The below are new with 5.10.1
+# These fail with 'OSError: received malformed or improperly truncated ancillary data'
+'test_socket.RecvmsgSCMRightsStreamTest.testCmsgTruncLen0',
+'test_socket.RecvmsgSCMRightsStreamTest.testCmsgTruncLen0Plus1',
+'test_socket.RecvmsgSCMRightsStreamTest.testCmsgTruncLen1',
+'test_socket.RecvmsgSCMRightsStreamTest.testCmsgTruncLen2Minus1',
+# Using the provided High Sierra binary, these fail with
+# 'ValueError: invalid protocol version _SSLMethod.PROTOCOL_SSLv3'.
+# gevent code isn't involved and running them unpatched has the same issue.
+'test_ssl.ContextTests.test_constructor',
+'test_ssl.ContextTests.test_protocol',
+'test_ssl.ContextTests.test_session_stats',
+'test_ssl.ThreadedTests.test_echo',
+'test_ssl.ThreadedTests.test_protocol_sslv23',
+'test_ssl.ThreadedTests.test_protocol_sslv3',
+'test_ssl.ThreadedTests.test_protocol_tlsv1',
+'test_ssl.ThreadedTests.test_protocol_tlsv1_1',
+# This gets an EOF in violation of protocol; again, even without gevent
+'test_ssl.NetworkedBIOTests.test_handshake',
+# This gets None instead of http1.1, even without gevent
+'test_ssl.ThreadedTests.test_npn_protocols',
+# This fails to decode a filename even without gevent,
+# at least on High Sierra.
+'test_httpservers.SimpleHTTPServerTestCase.test_undecodable_filename',
]
disabled_tests += [
...
@@ -516,7 +516,7 @@ class ThreadJoinOnShutdown(unittest.TestCase):
w = threading.Thread(target=worker)
w.start()
import sys
-if sys.version_info[:2] >= (3, 7) or (sys.version_info[:2] >= (3, 5) and hasattr(sys, 'pypy_version_info')):
+if sys.version_info[:2] >= (3, 7) or (sys.version_info[:2] >= (3, 5) and hasattr(sys, 'pypy_version_info') and sys.platform != 'darwin'):
w.join()
"""
# In PyPy3 5.8.0, if we don't wait on this top-level "thread", 'w',
@@ -526,6 +526,8 @@ class ThreadJoinOnShutdown(unittest.TestCase):
# the interpreter waiting on thread locks, like the issue described in threading.py
# for Python 3.4? in any case, it doesn't hang in Python 2.) This changed in
# 3.7a1 and waiting on it is again necessary and doesn't hang.
+# PyPy3 5.10.1 is back to the "old" cpython behaviour, and waiting on it
+# causes the whole process to hang, but apparently only on OS X---linux was fine without it
self._run_and_join(script)
...