Commit 5bdca4e0 authored by Sage Weil's avatar Sage Weil

libceph: fix messenger retry

In ancient times, the messenger could both initiate and accept connections.
An artifact if that was data structures to store/process an incoming
ceph_msg_connect request and send an outgoing ceph_msg_connect_reply.
Sadly, the negotiation code was referencing those structures and ignoring
important information (like the peer's connect_seq) from the correct ones.

Among other things, this fixes tight reconnect loops where the server sends
RETRY_SESSION and we (the client) retries with the same connect_seq as last
time.  This bug pretty easily triggered by injecting socket failures on the
MDS and running some fs workload like workunits/direct_io/test_sync_io.
Signed-off-by: default avatarSage Weil <sage@inktank.com>
parent a0185401
......@@ -163,16 +163,8 @@ struct ceph_connection {
/* connection negotiation temps */
char in_banner[CEPH_BANNER_MAX_LEN];
union {
struct { /* outgoing connection */
struct ceph_msg_connect out_connect;
struct ceph_msg_connect_reply in_reply;
};
struct { /* incoming */
struct ceph_msg_connect in_connect;
struct ceph_msg_connect_reply out_reply;
};
};
struct ceph_msg_connect out_connect;
struct ceph_msg_connect_reply in_reply;
struct ceph_entity_addr actual_peer_addr;
/* message out temps */
......
......@@ -1423,7 +1423,7 @@ static int process_connect(struct ceph_connection *con)
* dropped messages.
*/
dout("process_connect got RESET peer seq %u\n",
le32_to_cpu(con->in_connect.connect_seq));
le32_to_cpu(con->in_reply.connect_seq));
pr_err("%s%lld %s connection reset\n",
ENTITY_NAME(con->peer_name),
ceph_pr_addr(&con->peer_addr.in_addr));
......@@ -1450,10 +1450,10 @@ static int process_connect(struct ceph_connection *con)
* If we sent a smaller connect_seq than the peer has, try
* again with a larger value.
*/
dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
le32_to_cpu(con->out_connect.connect_seq),
le32_to_cpu(con->in_connect.connect_seq));
con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
le32_to_cpu(con->in_reply.connect_seq));
con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
ceph_con_out_kvec_reset(con);
ret = prepare_write_connect(con);
if (ret < 0)
......@@ -1468,9 +1468,9 @@ static int process_connect(struct ceph_connection *con)
*/
dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
con->peer_global_seq,
le32_to_cpu(con->in_connect.global_seq));
le32_to_cpu(con->in_reply.global_seq));
get_global_seq(con->msgr,
le32_to_cpu(con->in_connect.global_seq));
le32_to_cpu(con->in_reply.global_seq));
ceph_con_out_kvec_reset(con);
ret = prepare_write_connect(con);
if (ret < 0)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment