Commit 161cd45f authored by David S. Miller

Merge branch 'rds-mprds-foundations'

Sowmini Varadhan says:

====================
RDS: multiple connection paths for scaling

Today RDS-over-TCP is implemented by demux-ing multiple PF_RDS sockets
between any 2 endpoints (where endpoint == [IP address, port]) over a
single TCP socket between the 2 IP addresses involved. This has the
limitation that it ends up funneling multiple RDS flows over a single
TCP flow, thus the rds/tcp connection
   (a) is upper-bounded by the single-flow bandwidth,
   (b) suffers from head-of-line blocking for the RDS sockets.

Better throughput (for a fixed small packet size, MTU) can be achieved
by having multiple TCP/IP flows per rds/tcp connection, i.e., multipathed
RDS (mprds).  Each such TCP/IP flow constitutes a path for the rds/tcp
connection. RDS sockets will be attached to a path based on some hash
(e.g., of local address and RDS port number) and packets for that RDS
socket will be sent over the attached path using TCP to segment/reassemble
RDS datagrams on that path.

The table below, generated using a prototype that implements mprds,
shows that this is significant for scaling to 40G.  Packet sizes
used were: 8K byte req, 256 byte resp. MTU: 1500.  The parameters for
RDS-concurrency used below are described in the rds-stress(1) man page;
the number listed is proportional to the number of threads at which max
throughput was attained.

  -------------------------------------------------------------------
     RDS-concurrency   Num of       tx+rx K/s (iops)       throughput
     (-t N -d N)       TCP paths
  -------------------------------------------------------------------
        16             1             600K -  700K            4 Gbps
        28             8            5000K - 6000K           32 Gbps
  -------------------------------------------------------------------

FAQ: what is the relation between mprds and mptcp?
  mprds is orthogonal to mptcp. Whereas mptcp creates
  sub-flows for a single TCP connection, mprds parallelizes tx/rx
  at the RDS layer. MPRDS with N paths will allow N datagrams to
  be sent in parallel; each path will continue to send one
  datagram at a time, with sender and receiver keeping track of
  the retransmit and dgram-assembly state based on the RDS header.
  If desired, mptcp can additionally be used to speed up each TCP
  path. That acceleration is orthogonal to the parallelization benefits
  of mprds.

This patch series lays down the foundational data-structures to support
mprds in the kernel. It implements the changes to split up the
rds_connection structure into a common (to all paths) part,
and a per-path rds_conn_path. All I/O workqs are driven from
the rds_conn_path.

Note that this patchset does not (yet) actually enable multipathing
for any of the transports; all transports will continue to use a
single path with the refactored data-structures. A subsequent patchset
will add the changes to the rds-tcp module to actually use mprds
in rds-tcp.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents dcf1158b 3ecc5693
...@@ -235,7 +235,8 @@ void rds_cong_queue_updates(struct rds_cong_map *map) ...@@ -235,7 +235,8 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
* therefore trigger warnings. * therefore trigger warnings.
* Defer the xmit to rds_send_worker() instead. * Defer the xmit to rds_send_worker() instead.
*/ */
queue_delayed_work(rds_wq, &conn->c_send_w, 0); queue_delayed_work(rds_wq,
&conn->c_path[0].cp_send_w, 0);
} }
} }
......
This diff is collapsed.
...@@ -40,6 +40,7 @@ ...@@ -40,6 +40,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/module.h> #include <linux/module.h>
#include "rds_single_path.h"
#include "rds.h" #include "rds.h"
#include "ib.h" #include "ib.h"
#include "ib_mr.h" #include "ib_mr.h"
......
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/ratelimit.h> #include <linux/ratelimit.h>
#include "rds_single_path.h"
#include "rds.h" #include "rds.h"
#include "ib.h" #include "ib.h"
...@@ -273,7 +274,7 @@ static void rds_ib_tasklet_fn_send(unsigned long data) ...@@ -273,7 +274,7 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
if (rds_conn_up(conn) && if (rds_conn_up(conn) &&
(!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) || (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
test_bit(0, &conn->c_map_queued))) test_bit(0, &conn->c_map_queued)))
rds_send_xmit(ic->conn); rds_send_xmit(&ic->conn->c_path[0]);
} }
static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq, static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include <linux/rculist.h> #include <linux/rculist.h>
#include <linux/llist.h> #include <linux/llist.h>
#include "rds_single_path.h"
#include "ib_mr.h" #include "ib_mr.h"
struct workqueue_struct *rds_ib_mr_wq; struct workqueue_struct *rds_ib_mr_wq;
......
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include <linux/dma-mapping.h> #include <linux/dma-mapping.h>
#include <rdma/rdma_cm.h> #include <rdma/rdma_cm.h>
#include "rds_single_path.h"
#include "rds.h" #include "rds.h"
#include "ib.h" #include "ib.h"
......
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
#include <linux/dmapool.h> #include <linux/dmapool.h>
#include <linux/ratelimit.h> #include <linux/ratelimit.h>
#include "rds_single_path.h"
#include "rds.h" #include "rds.h"
#include "ib.h" #include "ib.h"
......
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/in.h> #include <linux/in.h>
#include "rds_single_path.h"
#include "rds.h" #include "rds.h"
#include "loop.h" #include "loop.h"
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include <linux/module.h> #include <linux/module.h>
#include <rdma/rdma_cm.h> #include <rdma/rdma_cm.h>
#include "rds_single_path.h"
#include "rdma_transport.h" #include "rdma_transport.h"
#include "ib.h" #include "ib.h"
......
...@@ -84,56 +84,69 @@ enum { ...@@ -84,56 +84,69 @@ enum {
#define RDS_IN_XMIT 2 #define RDS_IN_XMIT 2
#define RDS_RECV_REFILL 3 #define RDS_RECV_REFILL 3
/* Max number of multipaths per RDS connection. Must be a power of 2 */
#define RDS_MPATH_WORKERS 1
/*
 * Per mpath connection state.
 *
 * struct rds_connection embeds an array of these (c_path[RDS_MPATH_WORKERS]);
 * each element holds the state machine, sequence numbers, queues and work
 * items for one path of the connection.
 */
struct rds_conn_path {
/* connection this path belongs to; the workers and recv code read it back */
struct rds_connection *cp_conn;
/*
 * NOTE(review): the cp_xmit_* members look like the cursor for the message
 * currently being transmitted; the send path is not visible here -- confirm.
 */
struct rds_message *cp_xmit_rm;
unsigned long cp_xmit_sg;
unsigned int cp_xmit_hdr_off;
unsigned int cp_xmit_data_off;
unsigned int cp_xmit_atomic_sent;
unsigned int cp_xmit_rdma_sent;
unsigned int cp_xmit_data_sent;
spinlock_t cp_lock; /* protect msg queues */
u64 cp_next_tx_seq;
struct list_head cp_send_queue;
struct list_head cp_retrans;
/*
 * rds_recv_incoming() drops retransmitted messages whose sequence is below
 * this value and otherwise sets it to the received h_sequence + 1.
 */
u64 cp_next_rx_seq;
void *cp_transport_data;
/* RDS_CONN_* state; read and switched through the rds_conn_path_*() helpers */
atomic_t cp_state;
unsigned long cp_send_gen;
/* bit flags, e.g. RDS_RECONNECT_PENDING and RDS_LL_SEND_FULL */
unsigned long cp_flags;
/*
 * Reconnect backoff bound in jiffies: reset to 0 by
 * rds_connect_path_complete(), seeded and doubled (capped at
 * rds_sysctl_reconnect_max_jiffies) by rds_queue_reconnect().
 */
unsigned long cp_reconnect_jiffies;
/* work items run by rds_send_worker(), rds_recv_worker(), */
/* rds_connect_worker() and rds_shutdown_worker() respectively */
struct delayed_work cp_send_w;
struct delayed_work cp_recv_w;
struct delayed_work cp_conn_w;
struct work_struct cp_down_w;
struct mutex cp_cm_lock; /* protect cp_state & cm */
wait_queue_head_t cp_waitq;
unsigned int cp_unacked_packets;
unsigned int cp_unacked_bytes;
/* cp_outgoing is tested and cleared by rds_tcp_accept_one() */
unsigned int cp_outgoing:1,
cp_pad_to_32:31;
/* presumably this path's position within c_path[] -- TODO confirm */
unsigned int cp_index;
};
/* One rds_connection per RDS address pair */
struct rds_connection { struct rds_connection {
struct hlist_node c_hash_node; struct hlist_node c_hash_node;
__be32 c_laddr; __be32 c_laddr;
__be32 c_faddr; __be32 c_faddr;
unsigned int c_loopback:1, unsigned int c_loopback:1,
c_outgoing:1, c_pad_to_32:31;
c_pad_to_32:30; int c_npaths;
struct rds_connection *c_passive; struct rds_connection *c_passive;
struct rds_transport *c_trans;
struct rds_cong_map *c_lcong; struct rds_cong_map *c_lcong;
struct rds_cong_map *c_fcong; struct rds_cong_map *c_fcong;
struct rds_message *c_xmit_rm; /* Protocol version */
unsigned long c_xmit_sg; unsigned int c_version;
unsigned int c_xmit_hdr_off; possible_net_t c_net;
unsigned int c_xmit_data_off;
unsigned int c_xmit_atomic_sent;
unsigned int c_xmit_rdma_sent;
unsigned int c_xmit_data_sent;
spinlock_t c_lock; /* protect msg queues */
u64 c_next_tx_seq;
struct list_head c_send_queue;
struct list_head c_retrans;
u64 c_next_rx_seq;
struct rds_transport *c_trans;
void *c_transport_data;
atomic_t c_state;
unsigned long c_send_gen;
unsigned long c_flags;
unsigned long c_reconnect_jiffies;
struct delayed_work c_send_w;
struct delayed_work c_recv_w;
struct delayed_work c_conn_w;
struct work_struct c_down_w;
struct mutex c_cm_lock; /* protect conn state & cm */
wait_queue_head_t c_waitq;
struct list_head c_map_item; struct list_head c_map_item;
unsigned long c_map_queued; unsigned long c_map_queued;
unsigned int c_unacked_packets; struct rds_conn_path c_path[RDS_MPATH_WORKERS];
unsigned int c_unacked_bytes;
/* Protocol version */
unsigned int c_version;
possible_net_t c_net;
}; };
static inline static inline
...@@ -218,6 +231,7 @@ struct rds_incoming { ...@@ -218,6 +231,7 @@ struct rds_incoming {
atomic_t i_refcount; atomic_t i_refcount;
struct list_head i_item; struct list_head i_item;
struct rds_connection *i_conn; struct rds_connection *i_conn;
struct rds_conn_path *i_conn_path;
struct rds_header i_hdr; struct rds_header i_hdr;
unsigned long i_rx_jiffies; unsigned long i_rx_jiffies;
__be32 i_saddr; __be32 i_saddr;
...@@ -433,7 +447,8 @@ struct rds_transport { ...@@ -433,7 +447,8 @@ struct rds_transport {
char t_name[TRANSNAMSIZ]; char t_name[TRANSNAMSIZ];
struct list_head t_item; struct list_head t_item;
struct module *t_owner; struct module *t_owner;
unsigned int t_prefer_loopback:1; unsigned int t_prefer_loopback:1,
t_mp_capable:1;
unsigned int t_type; unsigned int t_type;
int (*laddr_check)(struct net *net, __be32 addr); int (*laddr_check)(struct net *net, __be32 addr);
...@@ -441,8 +456,11 @@ struct rds_transport { ...@@ -441,8 +456,11 @@ struct rds_transport {
void (*conn_free)(void *data); void (*conn_free)(void *data);
int (*conn_connect)(struct rds_connection *conn); int (*conn_connect)(struct rds_connection *conn);
void (*conn_shutdown)(struct rds_connection *conn); void (*conn_shutdown)(struct rds_connection *conn);
void (*conn_path_shutdown)(struct rds_conn_path *conn);
void (*xmit_prepare)(struct rds_connection *conn); void (*xmit_prepare)(struct rds_connection *conn);
void (*xmit_path_prepare)(struct rds_conn_path *cp);
void (*xmit_complete)(struct rds_connection *conn); void (*xmit_complete)(struct rds_connection *conn);
void (*xmit_path_complete)(struct rds_conn_path *cp);
int (*xmit)(struct rds_connection *conn, struct rds_message *rm, int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off); unsigned int hdr_off, unsigned int sg, unsigned int off);
int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
...@@ -636,10 +654,12 @@ struct rds_connection *rds_conn_create(struct net *net, ...@@ -636,10 +654,12 @@ struct rds_connection *rds_conn_create(struct net *net,
struct rds_connection *rds_conn_create_outgoing(struct net *net, struct rds_connection *rds_conn_create_outgoing(struct net *net,
__be32 laddr, __be32 faddr, __be32 laddr, __be32 faddr,
struct rds_transport *trans, gfp_t gfp); struct rds_transport *trans, gfp_t gfp);
void rds_conn_shutdown(struct rds_connection *conn); void rds_conn_shutdown(struct rds_conn_path *cpath);
void rds_conn_destroy(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn);
void rds_conn_path_drop(struct rds_conn_path *cpath);
void rds_conn_connect_if_down(struct rds_connection *conn); void rds_conn_connect_if_down(struct rds_connection *conn);
void rds_conn_path_connect_if_down(struct rds_conn_path *cp);
void rds_for_each_conn_info(struct socket *sock, unsigned int len, void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter, struct rds_info_iterator *iter,
struct rds_info_lengths *lens, struct rds_info_lengths *lens,
...@@ -650,28 +670,60 @@ void __rds_conn_error(struct rds_connection *conn, const char *, ...); ...@@ -650,28 +670,60 @@ void __rds_conn_error(struct rds_connection *conn, const char *, ...);
#define rds_conn_error(conn, fmt...) \ #define rds_conn_error(conn, fmt...) \
__rds_conn_error(conn, KERN_WARNING "RDS: " fmt) __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
#define rds_conn_path_error(cp, fmt...) \
__rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt)
/*
 * Atomically switch the path's state from @old to @new.
 *
 * Returns non-zero when cp_state held @old and the switch was made,
 * 0 when the compare-and-exchange found a different state.
 */
static inline int
rds_conn_path_transition(struct rds_conn_path *cp, int old, int new)
{
	int seen = atomic_cmpxchg(&cp->cp_state, old, new);

	return seen == old;
}
static inline int static inline int
rds_conn_transition(struct rds_connection *conn, int old, int new) rds_conn_transition(struct rds_connection *conn, int old, int new)
{ {
return atomic_cmpxchg(&conn->c_state, old, new) == old; WARN_ON(conn->c_trans->t_mp_capable);
return rds_conn_path_transition(&conn->c_path[0], old, new);
}
static inline int
rds_conn_path_state(struct rds_conn_path *cp)
{
return atomic_read(&cp->cp_state);
} }
static inline int static inline int
rds_conn_state(struct rds_connection *conn) rds_conn_state(struct rds_connection *conn)
{ {
return atomic_read(&conn->c_state); WARN_ON(conn->c_trans->t_mp_capable);
return rds_conn_path_state(&conn->c_path[0]);
}
/* Returns non-zero iff the path's current state is RDS_CONN_UP. */
static inline int
rds_conn_path_up(struct rds_conn_path *cp)
{
	int state = atomic_read(&cp->cp_state);

	return state == RDS_CONN_UP;
}
static inline int static inline int
rds_conn_up(struct rds_connection *conn) rds_conn_up(struct rds_connection *conn)
{ {
return atomic_read(&conn->c_state) == RDS_CONN_UP; WARN_ON(conn->c_trans->t_mp_capable);
return rds_conn_path_up(&conn->c_path[0]);
}
/* Returns non-zero iff the path's current state is RDS_CONN_CONNECTING. */
static inline int
rds_conn_path_connecting(struct rds_conn_path *cp)
{
	int state = atomic_read(&cp->cp_state);

	return state == RDS_CONN_CONNECTING;
}
static inline int static inline int
rds_conn_connecting(struct rds_connection *conn) rds_conn_connecting(struct rds_connection *conn)
{ {
return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING; WARN_ON(conn->c_trans->t_mp_capable);
return rds_conn_path_connecting(&conn->c_path[0]);
} }
/* message.c */ /* message.c */
...@@ -720,6 +772,8 @@ void rds_page_exit(void); ...@@ -720,6 +772,8 @@ void rds_page_exit(void);
/* recv.c */ /* recv.c */
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
__be32 saddr); __be32 saddr);
void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
__be32 saddr);
void rds_inc_put(struct rds_incoming *inc); void rds_inc_put(struct rds_incoming *inc);
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
struct rds_incoming *inc, gfp_t gfp); struct rds_incoming *inc, gfp_t gfp);
...@@ -733,16 +787,16 @@ void rds_inc_info_copy(struct rds_incoming *inc, ...@@ -733,16 +787,16 @@ void rds_inc_info_copy(struct rds_incoming *inc,
/* send.c */ /* send.c */
int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len); int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
void rds_send_reset(struct rds_connection *conn); void rds_send_path_reset(struct rds_conn_path *conn);
int rds_send_xmit(struct rds_connection *conn); int rds_send_xmit(struct rds_conn_path *cp);
struct sockaddr_in; struct sockaddr_in;
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack, void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked); is_acked_func is_acked);
int rds_send_pong(struct rds_connection *conn, __be16 dport); void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
struct rds_message *rds_send_get_message(struct rds_connection *, is_acked_func is_acked);
struct rm_rdma_op *); int rds_send_pong(struct rds_conn_path *cp, __be16 dport);
/* rdma.c */ /* rdma.c */
void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
...@@ -809,12 +863,12 @@ extern unsigned int rds_sysctl_trace_level; ...@@ -809,12 +863,12 @@ extern unsigned int rds_sysctl_trace_level;
int rds_threads_init(void); int rds_threads_init(void);
void rds_threads_exit(void); void rds_threads_exit(void);
extern struct workqueue_struct *rds_wq; extern struct workqueue_struct *rds_wq;
void rds_queue_reconnect(struct rds_connection *conn); void rds_queue_reconnect(struct rds_conn_path *cp);
void rds_connect_worker(struct work_struct *); void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *); void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *); void rds_recv_worker(struct work_struct *);
void rds_connect_path_complete(struct rds_connection *conn, int curr); void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
void rds_connect_complete(struct rds_connection *conn); void rds_connect_complete(struct rds_connection *conn);
/* transport.c */ /* transport.c */
......
#ifndef _RDS_RDS_SINGLE_H
#define _RDS_RDS_SINGLE_H
/*
 * Aliases for code that only ever deals with a single path.
 *
 * Each macro maps a c_<name> member spelling onto the matching member of
 * the first path, so that conn->c_<name> expands to
 * conn->c_path[0].cp_<name>.
 *
 * NOTE(review): these are plain textual substitutions, so every identifier
 * spelled c_<name> in an including file is rewritten, not only members of
 * struct rds_connection -- include this header only from single-path code.
 */
#define c_xmit_rm c_path[0].cp_xmit_rm
#define c_xmit_sg c_path[0].cp_xmit_sg
#define c_xmit_hdr_off c_path[0].cp_xmit_hdr_off
#define c_xmit_data_off c_path[0].cp_xmit_data_off
#define c_xmit_atomic_sent c_path[0].cp_xmit_atomic_sent
#define c_xmit_rdma_sent c_path[0].cp_xmit_rdma_sent
#define c_xmit_data_sent c_path[0].cp_xmit_data_sent
#define c_lock c_path[0].cp_lock
#define c_next_tx_seq c_path[0].cp_next_tx_seq
#define c_send_queue c_path[0].cp_send_queue
#define c_retrans c_path[0].cp_retrans
#define c_next_rx_seq c_path[0].cp_next_rx_seq
#define c_transport_data c_path[0].cp_transport_data
#define c_state c_path[0].cp_state
#define c_send_gen c_path[0].cp_send_gen
#define c_flags c_path[0].cp_flags
#define c_reconnect_jiffies c_path[0].cp_reconnect_jiffies
#define c_send_w c_path[0].cp_send_w
#define c_recv_w c_path[0].cp_recv_w
#define c_conn_w c_path[0].cp_conn_w
#define c_down_w c_path[0].cp_down_w
#define c_cm_lock c_path[0].cp_cm_lock
#define c_waitq c_path[0].cp_waitq
#define c_unacked_packets c_path[0].cp_unacked_packets
#define c_unacked_bytes c_path[0].cp_unacked_bytes
#endif /* _RDS_RDS_SINGLE_H */
...@@ -53,6 +53,20 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, ...@@ -53,6 +53,20 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
} }
EXPORT_SYMBOL_GPL(rds_inc_init); EXPORT_SYMBOL_GPL(rds_inc_init);
/*
 * Prepare @inc as a new incoming message received on path @cp from @saddr.
 *
 * The message starts with a single reference and an empty list linkage,
 * is bound to both the path and the connection that owns the path, and has
 * its RDMA cookie and receive timestamp cleared.
 */
void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
		       __be32 saddr)
{
	struct rds_connection *owner = cp->cp_conn;

	INIT_LIST_HEAD(&inc->i_item);
	atomic_set(&inc->i_refcount, 1);

	/* remember both the specific path and the connection it belongs to */
	inc->i_conn_path = cp;
	inc->i_conn = owner;
	inc->i_saddr = saddr;

	inc->i_rdma_cookie = 0;
	inc->i_rx_tstamp.tv_usec = 0;
	inc->i_rx_tstamp.tv_sec = 0;
}
EXPORT_SYMBOL_GPL(rds_inc_path_init);
static void rds_inc_addref(struct rds_incoming *inc) static void rds_inc_addref(struct rds_incoming *inc)
{ {
rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
...@@ -164,13 +178,18 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, ...@@ -164,13 +178,18 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
struct rds_sock *rs = NULL; struct rds_sock *rs = NULL;
struct sock *sk; struct sock *sk;
unsigned long flags; unsigned long flags;
struct rds_conn_path *cp;
inc->i_conn = conn; inc->i_conn = conn;
inc->i_rx_jiffies = jiffies; inc->i_rx_jiffies = jiffies;
if (conn->c_trans->t_mp_capable)
cp = inc->i_conn_path;
else
cp = &conn->c_path[0];
rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u " rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
"flags 0x%x rx_jiffies %lu\n", conn, "flags 0x%x rx_jiffies %lu\n", conn,
(unsigned long long)conn->c_next_rx_seq, (unsigned long long)cp->cp_next_rx_seq,
inc, inc,
(unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence), (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
be32_to_cpu(inc->i_hdr.h_len), be32_to_cpu(inc->i_hdr.h_len),
...@@ -199,16 +218,16 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, ...@@ -199,16 +218,16 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
* XXX we could spend more on the wire to get more robust failure * XXX we could spend more on the wire to get more robust failure
* detection, arguably worth it to avoid data corruption. * detection, arguably worth it to avoid data corruption.
*/ */
if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq && if (be64_to_cpu(inc->i_hdr.h_sequence) < cp->cp_next_rx_seq &&
(inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) { (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
rds_stats_inc(s_recv_drop_old_seq); rds_stats_inc(s_recv_drop_old_seq);
goto out; goto out;
} }
conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1; cp->cp_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) { if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
rds_stats_inc(s_recv_ping); rds_stats_inc(s_recv_ping);
rds_send_pong(conn, inc->i_hdr.h_sport); rds_send_pong(cp, inc->i_hdr.h_sport);
goto out; goto out;
} }
......
This diff is collapsed.
...@@ -38,6 +38,7 @@ ...@@ -38,6 +38,7 @@
#include <net/net_namespace.h> #include <net/net_namespace.h>
#include <net/netns/generic.h> #include <net/netns/generic.h>
#include "rds_single_path.h"
#include "rds.h" #include "rds.h"
#include "tcp.h" #include "tcp.h"
...@@ -185,7 +186,7 @@ void rds_tcp_reset_callbacks(struct socket *sock, ...@@ -185,7 +186,7 @@ void rds_tcp_reset_callbacks(struct socket *sock,
release_sock(osock->sk); release_sock(osock->sk);
sock_release(osock); sock_release(osock);
newsock: newsock:
rds_send_reset(conn); rds_send_path_reset(&conn->c_path[0]);
lock_sock(sock->sk); lock_sock(sock->sk);
write_lock_bh(&sock->sk->sk_callback_lock); write_lock_bh(&sock->sk->sk_callback_lock);
tc->t_sock = sock; tc->t_sock = sock;
......
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#include <linux/in.h> #include <linux/in.h>
#include <net/tcp.h> #include <net/tcp.h>
#include "rds_single_path.h"
#include "rds.h" #include "rds.h"
#include "tcp.h" #include "tcp.h"
...@@ -60,7 +61,8 @@ void rds_tcp_state_change(struct sock *sk) ...@@ -60,7 +61,8 @@ void rds_tcp_state_change(struct sock *sk)
case TCP_SYN_RECV: case TCP_SYN_RECV:
break; break;
case TCP_ESTABLISHED: case TCP_ESTABLISHED:
rds_connect_path_complete(conn, RDS_CONN_CONNECTING); rds_connect_path_complete(&conn->c_path[0],
RDS_CONN_CONNECTING);
break; break;
case TCP_CLOSE_WAIT: case TCP_CLOSE_WAIT:
case TCP_CLOSE: case TCP_CLOSE:
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include <linux/in.h> #include <linux/in.h>
#include <net/tcp.h> #include <net/tcp.h>
#include "rds_single_path.h"
#include "rds.h" #include "rds.h"
#include "tcp.h" #include "tcp.h"
...@@ -132,17 +133,19 @@ int rds_tcp_accept_one(struct socket *sock) ...@@ -132,17 +133,19 @@ int rds_tcp_accept_one(struct socket *sock)
* c_transport_data. * c_transport_data.
*/ */
if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) || if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) ||
!conn->c_outgoing) { !conn->c_path[0].cp_outgoing) {
goto rst_nsk; goto rst_nsk;
} else { } else {
rds_tcp_reset_callbacks(new_sock, conn); rds_tcp_reset_callbacks(new_sock, conn);
conn->c_outgoing = 0; conn->c_path[0].cp_outgoing = 0;
/* rds_connect_path_complete() marks RDS_CONN_UP */ /* rds_connect_path_complete() marks RDS_CONN_UP */
rds_connect_path_complete(conn, RDS_CONN_DISCONNECTING); rds_connect_path_complete(&conn->c_path[0],
RDS_CONN_DISCONNECTING);
} }
} else { } else {
rds_tcp_set_callbacks(new_sock, conn); rds_tcp_set_callbacks(new_sock, conn);
rds_connect_path_complete(conn, RDS_CONN_CONNECTING); rds_connect_path_complete(&conn->c_path[0],
RDS_CONN_CONNECTING);
} }
new_sock = NULL; new_sock = NULL;
ret = 0; ret = 0;
......
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <net/tcp.h> #include <net/tcp.h>
#include "rds_single_path.h"
#include "rds.h" #include "rds.h"
#include "tcp.h" #include "tcp.h"
......
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
#include <linux/in.h> #include <linux/in.h>
#include <net/tcp.h> #include <net/tcp.h>
#include "rds_single_path.h"
#include "rds.h" #include "rds.h"
#include "tcp.h" #include "tcp.h"
......
...@@ -71,30 +71,30 @@ ...@@ -71,30 +71,30 @@
struct workqueue_struct *rds_wq; struct workqueue_struct *rds_wq;
EXPORT_SYMBOL_GPL(rds_wq); EXPORT_SYMBOL_GPL(rds_wq);
void rds_connect_path_complete(struct rds_connection *conn, int curr) void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
{ {
if (!rds_conn_transition(conn, curr, RDS_CONN_UP)) { if (!rds_conn_path_transition(cp, curr, RDS_CONN_UP)) {
printk(KERN_WARNING "%s: Cannot transition to state UP, " printk(KERN_WARNING "%s: Cannot transition to state UP, "
"current state is %d\n", "current state is %d\n",
__func__, __func__,
atomic_read(&conn->c_state)); atomic_read(&cp->cp_state));
rds_conn_drop(conn); rds_conn_path_drop(cp);
return; return;
} }
rdsdebug("conn %p for %pI4 to %pI4 complete\n", rdsdebug("conn %p for %pI4 to %pI4 complete\n",
conn, &conn->c_laddr, &conn->c_faddr); cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
conn->c_reconnect_jiffies = 0; cp->cp_reconnect_jiffies = 0;
set_bit(0, &conn->c_map_queued); set_bit(0, &cp->cp_conn->c_map_queued);
queue_delayed_work(rds_wq, &conn->c_send_w, 0); queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
queue_delayed_work(rds_wq, &conn->c_recv_w, 0); queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
} }
EXPORT_SYMBOL_GPL(rds_connect_path_complete); EXPORT_SYMBOL_GPL(rds_connect_path_complete);
void rds_connect_complete(struct rds_connection *conn) void rds_connect_complete(struct rds_connection *conn)
{ {
rds_connect_path_complete(conn, RDS_CONN_CONNECTING); rds_connect_path_complete(&conn->c_path[0], RDS_CONN_CONNECTING);
} }
EXPORT_SYMBOL_GPL(rds_connect_complete); EXPORT_SYMBOL_GPL(rds_connect_complete);
...@@ -116,70 +116,79 @@ EXPORT_SYMBOL_GPL(rds_connect_complete); ...@@ -116,70 +116,79 @@ EXPORT_SYMBOL_GPL(rds_connect_complete);
* We should *always* start with a random backoff; otherwise a broken connection * We should *always* start with a random backoff; otherwise a broken connection
* will always take several iterations to be re-established. * will always take several iterations to be re-established.
*/ */
void rds_queue_reconnect(struct rds_connection *conn) void rds_queue_reconnect(struct rds_conn_path *cp)
{ {
unsigned long rand; unsigned long rand;
struct rds_connection *conn = cp->cp_conn;
rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n", rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
conn, &conn->c_laddr, &conn->c_faddr, conn, &conn->c_laddr, &conn->c_faddr,
conn->c_reconnect_jiffies); cp->cp_reconnect_jiffies);
set_bit(RDS_RECONNECT_PENDING, &conn->c_flags); set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
if (conn->c_reconnect_jiffies == 0) { if (cp->cp_reconnect_jiffies == 0) {
conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
queue_delayed_work(rds_wq, &conn->c_conn_w, 0); queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
return; return;
} }
get_random_bytes(&rand, sizeof(rand)); get_random_bytes(&rand, sizeof(rand));
rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n", rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies, rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
conn, &conn->c_laddr, &conn->c_faddr); conn, &conn->c_laddr, &conn->c_faddr);
queue_delayed_work(rds_wq, &conn->c_conn_w, queue_delayed_work(rds_wq, &cp->cp_conn_w,
rand % conn->c_reconnect_jiffies); rand % cp->cp_reconnect_jiffies);
conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2, cp->cp_reconnect_jiffies = min(cp->cp_reconnect_jiffies * 2,
rds_sysctl_reconnect_max_jiffies); rds_sysctl_reconnect_max_jiffies);
} }
void rds_connect_worker(struct work_struct *work) void rds_connect_worker(struct work_struct *work)
{ {
struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work); struct rds_conn_path *cp = container_of(work,
struct rds_conn_path,
cp_conn_w.work);
struct rds_connection *conn = cp->cp_conn;
int ret; int ret;
clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags); clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) { if (rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
ret = conn->c_trans->conn_connect(conn); ret = conn->c_trans->conn_connect(conn);
rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n", rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
conn, &conn->c_laddr, &conn->c_faddr, ret); conn, &conn->c_laddr, &conn->c_faddr, ret);
if (ret) { if (ret) {
if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN)) if (rds_conn_path_transition(cp,
rds_queue_reconnect(conn); RDS_CONN_CONNECTING,
RDS_CONN_DOWN))
rds_queue_reconnect(cp);
else else
rds_conn_error(conn, "RDS: connect failed\n"); rds_conn_path_error(cp,
"RDS: connect failed\n");
} }
} }
} }
void rds_send_worker(struct work_struct *work) void rds_send_worker(struct work_struct *work)
{ {
struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); struct rds_conn_path *cp = container_of(work,
struct rds_conn_path,
cp_send_w.work);
int ret; int ret;
if (rds_conn_state(conn) == RDS_CONN_UP) { if (rds_conn_path_state(cp) == RDS_CONN_UP) {
clear_bit(RDS_LL_SEND_FULL, &conn->c_flags); clear_bit(RDS_LL_SEND_FULL, &cp->cp_flags);
ret = rds_send_xmit(conn); ret = rds_send_xmit(cp);
cond_resched(); cond_resched();
rdsdebug("conn %p ret %d\n", conn, ret); rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
switch (ret) { switch (ret) {
case -EAGAIN: case -EAGAIN:
rds_stats_inc(s_send_immediate_retry); rds_stats_inc(s_send_immediate_retry);
queue_delayed_work(rds_wq, &conn->c_send_w, 0); queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
break; break;
case -ENOMEM: case -ENOMEM:
rds_stats_inc(s_send_delayed_retry); rds_stats_inc(s_send_delayed_retry);
queue_delayed_work(rds_wq, &conn->c_send_w, 2); queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
default: default:
break; break;
} }
...@@ -188,20 +197,22 @@ void rds_send_worker(struct work_struct *work) ...@@ -188,20 +197,22 @@ void rds_send_worker(struct work_struct *work)
void rds_recv_worker(struct work_struct *work) void rds_recv_worker(struct work_struct *work)
{ {
struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work); struct rds_conn_path *cp = container_of(work,
struct rds_conn_path,
cp_recv_w.work);
int ret; int ret;
if (rds_conn_state(conn) == RDS_CONN_UP) { if (rds_conn_path_state(cp) == RDS_CONN_UP) {
ret = conn->c_trans->recv(conn); ret = cp->cp_conn->c_trans->recv(cp->cp_conn);
rdsdebug("conn %p ret %d\n", conn, ret); rdsdebug("conn %p ret %d\n", cp->cp_conn, ret);
switch (ret) { switch (ret) {
case -EAGAIN: case -EAGAIN:
rds_stats_inc(s_recv_immediate_retry); rds_stats_inc(s_recv_immediate_retry);
queue_delayed_work(rds_wq, &conn->c_recv_w, 0); queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
break; break;
case -ENOMEM: case -ENOMEM:
rds_stats_inc(s_recv_delayed_retry); rds_stats_inc(s_recv_delayed_retry);
queue_delayed_work(rds_wq, &conn->c_recv_w, 2); queue_delayed_work(rds_wq, &cp->cp_recv_w, 2);
default: default:
break; break;
} }
...@@ -210,9 +221,11 @@ void rds_recv_worker(struct work_struct *work) ...@@ -210,9 +221,11 @@ void rds_recv_worker(struct work_struct *work)
void rds_shutdown_worker(struct work_struct *work) void rds_shutdown_worker(struct work_struct *work)
{ {
struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); struct rds_conn_path *cp = container_of(work,
struct rds_conn_path,
cp_down_w);
rds_conn_shutdown(conn); rds_conn_shutdown(cp);
} }
void rds_threads_exit(void) void rds_threads_exit(void)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment