Commit c25e68bc authored by Julien Muchembled

storage: speed up replication by sending bigger network packets

parent 96aeb716
......@@ -305,6 +305,7 @@ class Connection(BaseConnection):
# XXX: rename isPending, hasPendingMessages & pending methods
buffering = False
connecting = True
client = False
server = False
......@@ -541,8 +542,16 @@ class Connection(BaseConnection):
def _addPacket(self, packet):
"""Add a packet into the write buffer."""
if self.connector.queue(packet.encode()):
if packet.nodelay or 65536 < self.connector.queue_size:
assert not self.buffering
# enable polling for writing.
self.em.addWriter(self)
else:
self.buffering = True
elif self.buffering and (65536 < self.connector.queue_size
or packet.nodelay):
self.buffering = False
self.em.addWriter(self)
logging.packet(self, packet, True)
def send(self, packet, msg_id=None):
......
......@@ -72,11 +72,13 @@ class SocketConnector(object):
# disable Nagle algorithm to reduce latency
s.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
self.queued = [ENCODED_VERSION]
self.queue_size = len(ENCODED_VERSION)
return self
def queue(self, data):
was_empty = not self.queued
self.queued += data
self.queue_size += len(data)
return was_empty
def _error(self, op, exc=None):
......@@ -183,8 +185,10 @@ class SocketConnector(object):
# could be sent.
if n != len(msg):
self.queued[:] = msg[n:],
self.queue_size -= n
return False
del self.queued[:]
self.queue_size = 0
else:
assert not self.queued
return True
......
......@@ -224,6 +224,7 @@ class Packet(object):
_code = None
_fmt = None
_id = None
nodelay = True
poll_thread = False
def __init__(self, *args):
......@@ -1441,6 +1442,8 @@ class AddTransaction(Packet):
"""
S -> S
"""
nodelay = False
_fmt = PStruct('add_transaction',
PTID('tid'),
PString('user'),
......@@ -1480,6 +1483,8 @@ class AddObject(Packet):
"""
S -> S
"""
nodelay = False
_fmt = PStruct('add_object',
POID('oid'),
PTID('serial'),
......
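Taken together, the hunks above (Connection._addPacket, the connector's queue_size accounting, and the nodelay flag on AddTransaction/AddObject) implement one buffering policy. Here is a minimal self-contained sketch of that policy, using hypothetical names rather than NEO's actual classes: packets flagged nodelay = False only accumulate in the write queue, and the event loop is asked to poll the connection for writing only once a nodelay packet is queued or more than 64 KiB is pending.

    BUFFER_LIMIT = 65536  # same threshold as in _addPacket above

    class BufferedWriter(object):
        """Sketch only: delay write registration for bulk packets."""

        def __init__(self, register_writer):
            self.queued = []
            self.queue_size = 0
            self.buffering = False
            self._register_writer = register_writer  # e.g. em.addWriter(conn)

        def add_packet(self, data, nodelay=True):
            was_empty = not self.queued
            self.queued.append(data)
            self.queue_size += len(data)
            if was_empty:
                if nodelay or self.queue_size > BUFFER_LIMIT:
                    # Urgent packet or already big enough: poll for writing now.
                    self._register_writer()
                else:
                    # Bulk data (AddTransaction/AddObject): keep filling the
                    # buffer; the writer will be registered later.
                    self.buffering = True
            elif self.buffering and (nodelay or self.queue_size > BUFFER_LIMIT):
                self.buffering = False
                self._register_writer()

With this policy, a run of AddObject packets during replication goes out as a few large TCP segments instead of one small segment per packet, which is what the commit title refers to.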
......@@ -254,9 +254,8 @@ class Application(BaseApplication):
while task_queue:
try:
while isIdle():
if task_queue[-1].next():
next(task_queue[-1]) or task_queue.rotate()
_poll(0)
task_queue.rotate()
break
except StopIteration:
task_queue.pop()
......@@ -270,10 +269,6 @@ class Application(BaseApplication):
self.replicator.stop()
def newTask(self, iterator):
try:
iterator.next()
except StopIteration:
return
self.task_queue.appendleft(iterator)
def closeClient(self, connection):
......
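For context, a minimal standalone sketch of the scheduler pattern changed above (isIdle and _poll stand for the callables visible in the diff; everything else is a hypothetical stand-in): background tasks are generators kept in a deque, and the value a task yields tells the scheduler whether to keep running the same task (truthy) or to rotate to the next one (falsy).

    from collections import deque

    task_queue = deque()

    def newTask(iterator):
        # As in the new version above, the iterator is no longer primed here.
        task_queue.appendleft(iterator)

    def run_background_tasks(isIdle, _poll):
        while task_queue:
            try:
                while isIdle():
                    # A task that yields a truthy value (e.g. conn.buffering,
                    # used further down) keeps running; otherwise the deque is
                    # rotated so other tasks get a turn.
                    next(task_queue[-1]) or task_queue.rotate()
                    _poll(0)  # keep servicing the network between steps
                break
            except StopIteration:
                task_queue.pop()  # the current task is finished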
......@@ -388,7 +388,7 @@ class ImporterDatabaseManager(DatabaseManager):
finish()
txn = z.transaction
tid = txn.tid
yield 1
yield
zodb = z.zodb
for r in z.transaction:
oid = p64(u64(r.oid) + zodb.shift_oid)
......@@ -413,7 +413,7 @@ class ImporterDatabaseManager(DatabaseManager):
# update 'obj' with 'object_list', some rows in 'data' may be
# unreferenced. This is not a problem because the leak is
# solved when resuming the migration.
yield 1
yield
try:
z.next()
except StopIteration:
......
......@@ -157,7 +157,7 @@ class StorageOperationHandler(EventHandler):
conn.send(Packets.AnswerCheckTIDRange(*r), msg_id)
except (weakref.ReferenceError, ConnectionClosed):
pass
yield
return; yield
  • How do you make a generator function that stops at the first attempt to fetch a value?

    Here, a task is used to postpone the work until the node is idle, but there's no point in splitting the work itself (see the sketch after this thread).

  • Ah, indeed. Then what about adding a comment and avoiding the two statements on one line? The code as-is looks to me like an intermediate local change that was accidentally committed.

  • Comment added (f2070ca4). The two statements on one line are on purpose, because it's really the combination of these 2 small statements that makes sense.

  • With the added comment, it's fine for me.
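To illustrate the answer above, a minimal sketch of the idiom (not part of the commit): the unreachable bare yield turns the function into a generator, and the return before it ends the generator before any value is produced, so the first next() runs the whole body and then raises StopIteration.

    def check():
        print('all the work happens here, in one step')
        # The bare yield below only serves to make this a generator function;
        # the return before it stops the generator at the first attempt to
        # fetch a value.
        return; yield

    task = check()      # builds the generator, runs nothing yet
    try:
        next(task)      # runs the whole body, then raises StopIteration
    except StopIteration:
        pass            # the scheduler pops the finished task here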

app.newTask(check())
@checkFeedingConnection(check=True)
......@@ -173,7 +173,7 @@ class StorageOperationHandler(EventHandler):
conn.send(Packets.AnswerCheckSerialRange(*r), msg_id)
except (weakref.ReferenceError, ConnectionClosed):
pass
yield
return; yield
app.newTask(check())
@checkFeedingConnection(check=False)
......@@ -209,9 +209,17 @@ class StorageOperationHandler(EventHandler):
% partition), msg_id)
return
oid_list, user, desc, ext, packed, ttid = t
# Sending such packet does not mark the connection
# for writing if there's too little data in the buffer.
conn.send(Packets.AddTransaction(tid, user,
desc, ext, packed, ttid, oid_list), msg_id)
yield
# To avoid delaying several connections simultaneously,
# and also prevent the backend from scanning different
# parts of the DB at the same time, we ask the
# scheduler not to switch to another background task.
# Ideally, we are filling a buffer while the kernel
# is flushing another one for a concurrent connection.
yield conn.buffering
conn.send(Packets.AnswerFetchTransactions(
pack_tid, next_tid, peer_tid_set), msg_id)
yield
......@@ -253,9 +261,10 @@ class StorageOperationHandler(EventHandler):
"partition %u dropped or truncated"
% partition), msg_id)
return
# Same as in askFetchTransactions.
conn.send(Packets.AddObject(oid, serial, *object[2:]),
msg_id)
yield
yield conn.buffering
conn.send(Packets.AnswerFetchObjects(
pack_tid, next_tid, next_oid, object_dict), msg_id)
yield
......
  • Maybe we should not set TCP_NODELAY on storage-storage connections? Or maybe use TCP_CORK?

  • I considered it but it would not be as efficient. For example, you'd still have overhead with a lot of epoll_*+send syscalls (+ the Python code to do them). It would also slow down control packets (AskFetch/AnswerFetch).

    Unsetting TCP_NODELAY or setting TCP_CORK would not delay packets for long. The difference with this commit may be negligible, though. Note that for the type of packet that is buffered, it's fine if the delay is unbounded: on the receiving side, there may be no commit until the next AnswerFetch packet.

    TCP_CORK is not portable; a sketch of both options follows this thread.
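For reference, a minimal sketch of the two socket options discussed in this thread (plain Python socket calls, not NEO code):

    import socket

    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

    # What NEO already does on every connection: disable Nagle's algorithm so
    # that small control packets are sent immediately.
    s.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)

    # The alternative considered above: cork the socket while queuing many
    # small writes, then uncork to flush them as a few large segments.
    # socket.TCP_CORK only exists on Linux, hence "not portable".
    if hasattr(socket, 'TCP_CORK'):
        s.setsockopt(socket.IPPROTO_TCP, socket.TCP_CORK, 1)
        # ... many small s.send(...) calls would go here ...
        s.setsockopt(socket.IPPROTO_TCP, socket.TCP_CORK, 0)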
