Commit f7f35c02 authored by David S. Miller

Merge branch 'rds-next'

Sowmini Varadhan says:

====================
net/rds: SOL_RDS socket option to explicitly select transport

Today the underlying transport (TCP or IB) for a PF_RDS socket is
implicitly selected based on the local address used to bind(2) the
PF_RDS socket. This results in some non-deterministic behavior when
there are un-numbered and IPoIB interfaces sharing the same IP address.
It also places the constraint that the IB interface must have an IP
address (and thus, IPoIB) configured on it.

The non-determinism may be avoided by providing the user-space application
a socket option that allows it to explicitly select the transport
prior to bind(2).

Patch 1 of this series provides the constant definitions needed by
the application via <linux/rds.h>.

Patch 2 provides the setsockopt support, and Patch 3 provides the
getsockopt support.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents f16e9d86 8ba38460
...@@ -38,6 +38,8 @@ ...@@ -38,6 +38,8 @@
#define RDS_IB_ABI_VERSION 0x301 #define RDS_IB_ABI_VERSION 0x301
#define SOL_RDS 276
/* /*
* setsockopt/getsockopt for SOL_RDS * setsockopt/getsockopt for SOL_RDS
*/ */
...@@ -48,6 +50,14 @@ ...@@ -48,6 +50,14 @@
#define RDS_RECVERR 5 #define RDS_RECVERR 5
#define RDS_CONG_MONITOR 6 #define RDS_CONG_MONITOR 6
#define RDS_GET_MR_FOR_DEST 7 #define RDS_GET_MR_FOR_DEST 7
/*
 * SOL_RDS option that explicitly selects the transport for a socket.
 * Setting it fails with EOPNOTSUPP once a transport is already attached,
 * so it is meant to be used before bind(2).
 */
#define SO_RDS_TRANSPORT 8
/* supported values for SO_RDS_TRANSPORT */
#define RDS_TRANS_IB 0
#define RDS_TRANS_IWARP 1
#define RDS_TRANS_TCP 2
/*
 * Upper bound on transport type values: setsockopt rejects any value
 * that is negative or >= RDS_TRANS_COUNT with EINVAL.
 */
#define RDS_TRANS_COUNT 3
/* reported by getsockopt(SO_RDS_TRANSPORT) when no transport is attached */
#define RDS_TRANS_NONE (~0)
/* /*
* Control message types for SOL_RDS. * Control message types for SOL_RDS.
......
...@@ -270,6 +270,28 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval, ...@@ -270,6 +270,28 @@ static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
return ret; return ret;
} }
/*
 * SO_RDS_TRANSPORT setsockopt handler: attach the transport requested by
 * user space to @rs.  rds_setsockopt() calls this with the socket locked.
 *
 * @rs:     RDS socket whose transport is being selected
 * @optval: user pointer to an int holding one of the RDS_TRANS_* values
 * @optlen: size of the user buffer; must be exactly sizeof(int)
 *
 * Returns 0 on success, or
 *   -EOPNOTSUPP  if a transport is already attached to the socket,
 *   -EINVAL      if @optlen is wrong or the value is out of range,
 *   -EFAULT      if the value cannot be copied from user space,
 *   -ENOPROTOOPT if rds_trans_get() yields no transport for that type.
 */
static int rds_set_transport(struct rds_sock *rs, char __user *optval,
			     int optlen)
{
	int requested;

	/* the transport may only be chosen once per socket */
	if (rs->rs_transport)
		return -EOPNOTSUPP;

	if (optlen != sizeof(int))
		return -EINVAL;

	if (copy_from_user(&requested, (int __user *)optval, sizeof(requested)))
		return -EFAULT;

	/* only values in [0, RDS_TRANS_COUNT) name a transport type */
	if (!(requested >= 0 && requested < RDS_TRANS_COUNT))
		return -EINVAL;

	rs->rs_transport = rds_trans_get(requested);
	if (!rs->rs_transport)
		return -ENOPROTOOPT;

	return 0;
}
static int rds_setsockopt(struct socket *sock, int level, int optname, static int rds_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen) char __user *optval, unsigned int optlen)
{ {
...@@ -300,6 +322,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname, ...@@ -300,6 +322,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
case RDS_CONG_MONITOR: case RDS_CONG_MONITOR:
ret = rds_cong_monitor(rs, optval, optlen); ret = rds_cong_monitor(rs, optval, optlen);
break; break;
case SO_RDS_TRANSPORT:
lock_sock(sock->sk);
ret = rds_set_transport(rs, optval, optlen);
release_sock(sock->sk);
break;
default: default:
ret = -ENOPROTOOPT; ret = -ENOPROTOOPT;
} }
...@@ -312,6 +339,7 @@ static int rds_getsockopt(struct socket *sock, int level, int optname, ...@@ -312,6 +339,7 @@ static int rds_getsockopt(struct socket *sock, int level, int optname,
{ {
struct rds_sock *rs = rds_sk_to_rs(sock->sk); struct rds_sock *rs = rds_sk_to_rs(sock->sk);
int ret = -ENOPROTOOPT, len; int ret = -ENOPROTOOPT, len;
int trans;
if (level != SOL_RDS) if (level != SOL_RDS)
goto out; goto out;
...@@ -337,6 +365,19 @@ static int rds_getsockopt(struct socket *sock, int level, int optname, ...@@ -337,6 +365,19 @@ static int rds_getsockopt(struct socket *sock, int level, int optname,
else else
ret = 0; ret = 0;
break; break;
case SO_RDS_TRANSPORT:
if (len < sizeof(int)) {
ret = -EINVAL;
break;
}
trans = (rs->rs_transport ? rs->rs_transport->t_type :
RDS_TRANS_NONE); /* unbound */
if (put_user(trans, (int __user *)optval) ||
put_user(sizeof(int), optlen))
ret = -EFAULT;
else
ret = 0;
break;
default: default:
break; break;
} }
......
...@@ -181,6 +181,10 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ...@@ -181,6 +181,10 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
if (ret) if (ret)
goto out; goto out;
if (rs->rs_transport) { /* previously bound */
ret = 0;
goto out;
}
trans = rds_trans_get_preferred(sin->sin_addr.s_addr); trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
if (!trans) { if (!trans) {
ret = -EADDRNOTAVAIL; ret = -EADDRNOTAVAIL;
......
...@@ -408,11 +408,6 @@ struct rds_notifier { ...@@ -408,11 +408,6 @@ struct rds_notifier {
* should try hard not to block. * should try hard not to block.
*/ */
#define RDS_TRANS_IB 0
#define RDS_TRANS_IWARP 1
#define RDS_TRANS_TCP 2
#define RDS_TRANS_COUNT 3
struct rds_transport { struct rds_transport {
char t_name[TRANSNAMSIZ]; char t_name[TRANSNAMSIZ];
struct list_head t_item; struct list_head t_item;
...@@ -803,6 +798,7 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr); ...@@ -803,6 +798,7 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr);
void rds_trans_put(struct rds_transport *trans); void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail); unsigned int avail);
struct rds_transport *rds_trans_get(int t_type);
int rds_trans_init(void); int rds_trans_init(void);
void rds_trans_exit(void); void rds_trans_exit(void);
......
...@@ -101,6 +101,27 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr) ...@@ -101,6 +101,27 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr)
return ret; return ret;
} }
struct rds_transport *rds_trans_get(int t_type)
{
struct rds_transport *ret = NULL;
struct rds_transport *trans;
unsigned int i;
down_read(&rds_trans_sem);
for (i = 0; i < RDS_TRANS_COUNT; i++) {
trans = transports[i];
if (trans && trans->t_type == t_type &&
(!trans->t_owner || try_module_get(trans->t_owner))) {
ret = trans;
break;
}
}
up_read(&rds_trans_sem);
return ret;
}
/* /*
* This returns the number of stats entries in the snapshot and only * This returns the number of stats entries in the snapshot and only
* copies them using the iter if there is enough space for them. The * copies them using the iter if there is enough space for them. The
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment