Commit 839fcaba authored by Michael S. Tsirkin, committed by Roland Dreier

IPoIB: Connected mode experimental support

The following patch adds experimental support for IPoIB connected
mode, as defined by the draft from the IETF ipoib working group.  The
idea is to increase performance by increasing the MTU from the maximum
of 2K (theoretically 4K) supported by IPoIB on top of UD.  With this
code, I'm able to get 800MByte/sec or more with netperf without
options on a Mellanox 4x back-to-back DDR system.

Some notes on code:
1. SRQ is used for scalability to large cluster sizes
2. Only RC connections are used (UC does not support SRQ now)
3. Retry count is set to 0 since spec draft warns against retries
4. Each connection is used for data transfers in only 1 direction, so
   each connection is either active(TX) or passive (RX).  2 sides that
   want to communicate create 2 connections.
5. Each active (TX) connection has a separate CQ for send completions -
   this keeps the code simple without CQ resize and other tricks
6. To detect stale passive side connections (where the remote side is
   down), we keep an LRU list of passive connections (updated once per
   second per connection) and destroy a connection after it has been
   unused for several seconds. The LRU rule makes it possible to avoid
   scanning connections that have recently been active.
Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
parent 9a6b090c
config INFINIBAND_IPOIB config INFINIBAND_IPOIB
tristate "IP-over-InfiniBand" tristate "IP-over-InfiniBand"
depends on INFINIBAND && NETDEVICES && INET depends on INFINIBAND && NETDEVICES && INET && (IPV6 || IPV6=n)
---help--- ---help---
Support for the IP-over-InfiniBand protocol (IPoIB). This Support for the IP-over-InfiniBand protocol (IPoIB). This
transports IP packets over InfiniBand so you can use your IB transports IP packets over InfiniBand so you can use your IB
...@@ -8,6 +8,20 @@ config INFINIBAND_IPOIB ...@@ -8,6 +8,20 @@ config INFINIBAND_IPOIB
See Documentation/infiniband/ipoib.txt for more information See Documentation/infiniband/ipoib.txt for more information
config INFINIBAND_IPOIB_CM
bool "IP-over-InfiniBand Connected Mode support"
depends on INFINIBAND_IPOIB && EXPERIMENTAL
default n
---help---
This option enables experimental support for IPoIB connected mode.
After enabling this option, you need to switch to connected mode through
/sys/class/net/ibXXX/mode to actually create connections, and then increase
the interface MTU with e.g. ifconfig ib0 mtu 65520.
WARNING: Enabling connected mode will trigger some
packet drops for multicast and UD mode traffic from this interface,
unless you limit mtu for these destinations to 2044.
config INFINIBAND_IPOIB_DEBUG config INFINIBAND_IPOIB_DEBUG
bool "IP-over-InfiniBand debugging" if EMBEDDED bool "IP-over-InfiniBand debugging" if EMBEDDED
depends on INFINIBAND_IPOIB depends on INFINIBAND_IPOIB
......
...@@ -5,5 +5,6 @@ ib_ipoib-y := ipoib_main.o \ ...@@ -5,5 +5,6 @@ ib_ipoib-y := ipoib_main.o \
ipoib_multicast.o \ ipoib_multicast.o \
ipoib_verbs.o \ ipoib_verbs.o \
ipoib_vlan.o ipoib_vlan.o
ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o
ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o
...@@ -62,6 +62,10 @@ enum { ...@@ -62,6 +62,10 @@ enum {
IPOIB_ENCAP_LEN = 4, IPOIB_ENCAP_LEN = 4,
IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */
IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN,
IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
IPOIB_RX_RING_SIZE = 128, IPOIB_RX_RING_SIZE = 128,
IPOIB_TX_RING_SIZE = 64, IPOIB_TX_RING_SIZE = 64,
IPOIB_MAX_QUEUE_SIZE = 8192, IPOIB_MAX_QUEUE_SIZE = 8192,
...@@ -81,6 +85,8 @@ enum { ...@@ -81,6 +85,8 @@ enum {
IPOIB_MCAST_RUN = 6, IPOIB_MCAST_RUN = 6,
IPOIB_STOP_REAPER = 7, IPOIB_STOP_REAPER = 7,
IPOIB_MCAST_STARTED = 8, IPOIB_MCAST_STARTED = 8,
IPOIB_FLAG_NETIF_STOPPED = 9,
IPOIB_FLAG_ADMIN_CM = 10,
IPOIB_MAX_BACKOFF_SECONDS = 16, IPOIB_MAX_BACKOFF_SECONDS = 16,
...@@ -90,6 +96,13 @@ enum { ...@@ -90,6 +96,13 @@ enum {
IPOIB_MCAST_FLAG_ATTACHED = 3, IPOIB_MCAST_FLAG_ATTACHED = 3,
}; };
/*
 * Bits tested in wc->wr_id by ipoib_ib_handle_wc() to route a completion.
 *
 * IPOIB_OP_RECV: completion is handed to the receive handler; a
 * completion carrying neither bit is handled as a send completion.
 */
#define IPOIB_OP_RECV (1ul << 31)
#ifdef CONFIG_INFINIBAND_IPOIB_CM
/* Completions carrying this bit are handed to ipoib_cm_handle_rx_wc(). */
#define IPOIB_CM_OP_SRQ (1ul << 30)
#else
/* Connected mode compiled out: a mask of 0 makes the bit test always false. */
#define IPOIB_CM_OP_SRQ (0)
#endif
/* structs */ /* structs */
struct ipoib_header { struct ipoib_header {
...@@ -113,6 +126,59 @@ struct ipoib_tx_buf { ...@@ -113,6 +126,59 @@ struct ipoib_tx_buf {
u64 mapping; u64 mapping;
}; };
struct ib_cm_id;
/*
 * A QPN/MTU pair stored as big-endian 32-bit words.
 * NOTE(review): presumably the private data exchanged during CM
 * connection setup -- confirm against ipoib_cm.c.
 */
struct ipoib_cm_data {
__be32 qpn; /* High byte MUST be ignored on receive */
__be32 mtu;
};
/*
 * State for one connected-mode receive connection (what the commit
 * notes call a "passive" connection).
 */
struct ipoib_cm_rx {
struct ib_cm_id *id;
struct ib_qp *qp;
/* NOTE(review): presumably links into ipoib_cm_dev_priv.passive_ids -- confirm */
struct list_head list;
struct net_device *dev;
/*
 * NOTE(review): looks like the last-use timestamp used for the LRU
 * stale-connection detection described in the commit notes -- confirm
 */
unsigned long jiffies;
};
/*
 * State for one connected-mode send connection (what the commit notes
 * call an "active" connection).  struct ipoib_neigh points at one of
 * these through its cm member.
 */
struct ipoib_cm_tx {
struct ib_cm_id *id;
/* Per the commit notes, each TX connection has its own CQ for send completions. */
struct ib_cq *cq;
struct ib_qp *qp;
struct list_head list;
struct net_device *dev;
struct ipoib_neigh *neigh;
struct ipoib_path *path;
/* NOTE(review): presumably a send ring with head/tail indices, as for priv->tx_head/tx_tail -- confirm */
struct ipoib_tx_buf *tx_ring;
unsigned tx_head;
unsigned tx_tail;
/* Bit flags; ipoib_cm_up() tests IPOIB_FLAG_OPER_UP here. */
unsigned long flags;
u32 mtu;
struct ib_wc ibwc[IPOIB_NUM_WC];
};
/*
 * One connected-mode receive buffer: an skb plus IPOIB_CM_RX_SG mapping
 * values, one per scatter/gather entry.
 * NOTE(review): mapping[] presumably holds DMA addresses -- confirm.
 */
struct ipoib_cm_rx_buf {
struct sk_buff *skb;
u64 mapping[IPOIB_CM_RX_SG];
};
/*
 * Per-device connected-mode state.  Embedded in struct ipoib_dev_priv as
 * the "cm" member when CONFIG_INFINIBAND_IPOIB_CM is set.
 */
struct ipoib_cm_dev_priv {
/* Per the commit notes, an SRQ is used for scalability to large clusters. */
struct ib_srq *srq;
/* NOTE(review): presumably the receive buffers posted to srq -- confirm */
struct ipoib_cm_rx_buf *srq_ring;
struct ib_cm_id *id;
/* NOTE(review): presumably the LRU list of passive (RX) connections -- confirm */
struct list_head passive_ids;
struct work_struct start_task;
struct work_struct reap_task;
struct work_struct skb_task;
/* NOTE(review): presumably the periodic scan that destroys stale passive connections -- confirm */
struct delayed_work stale_task;
struct sk_buff_head skb_queue;
struct list_head start_list;
struct list_head reap_list;
struct ib_wc ibwc[IPOIB_NUM_WC];
/* Receive scatter/gather list with IPOIB_CM_RX_SG entries, and a receive work request. */
struct ib_sge rx_sge[IPOIB_CM_RX_SG];
struct ib_recv_wr rx_wr;
};
/* /*
* Device private locking: tx_lock protects members used in TX fast * Device private locking: tx_lock protects members used in TX fast
* path (and we use LLTX so upper layers don't do extra locking). * path (and we use LLTX so upper layers don't do extra locking).
...@@ -179,6 +245,10 @@ struct ipoib_dev_priv { ...@@ -179,6 +245,10 @@ struct ipoib_dev_priv {
struct list_head child_intfs; struct list_head child_intfs;
struct list_head list; struct list_head list;
#ifdef CONFIG_INFINIBAND_IPOIB_CM
struct ipoib_cm_dev_priv cm;
#endif
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
struct list_head fs_list; struct list_head fs_list;
struct dentry *mcg_dentry; struct dentry *mcg_dentry;
...@@ -212,6 +282,9 @@ struct ipoib_path { ...@@ -212,6 +282,9 @@ struct ipoib_path {
struct ipoib_neigh { struct ipoib_neigh {
struct ipoib_ah *ah; struct ipoib_ah *ah;
#ifdef CONFIG_INFINIBAND_IPOIB_CM
struct ipoib_cm_tx *cm;
#endif
union ib_gid dgid; union ib_gid dgid;
struct sk_buff_head queue; struct sk_buff_head queue;
...@@ -315,6 +388,146 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey); ...@@ -315,6 +388,146 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
void ipoib_pkey_poll(struct work_struct *work); void ipoib_pkey_poll(struct work_struct *work);
int ipoib_pkey_dev_delay_open(struct net_device *dev); int ipoib_pkey_dev_delay_open(struct net_device *dev);
#ifdef CONFIG_INFINIBAND_IPOIB_CM
/* Connection-type flag bits tested in byte 0 of a hardware address. */
#define IPOIB_FLAGS_RC 0x80
#define IPOIB_FLAGS_UC 0x40
/*
 * We don't support UC connections at the moment, so only the RC bit is
 * tested.  Evaluates to nonzero when byte 0 of @ha has IPOIB_FLAGS_RC set.
 * The argument is parenthesized so that pointer expressions such as
 * (addr + off) are indexed as a whole.
 */
#define IPOIB_CM_SUPPORTED(ha) ((ha)[0] & (IPOIB_FLAGS_RC))
/*
 * Return 1 when the device's own hardware address advertises connected
 * mode support and IPOIB_FLAG_ADMIN_CM is set in the device flags;
 * return 0 otherwise.
 */
static inline int ipoib_cm_admin_enabled(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
		return 0;

	return test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) ? 1 : 0;
}
/*
 * Return 1 when neighbour @n's hardware address advertises connected
 * mode support and IPOIB_FLAG_ADMIN_CM is set in the flags of @dev;
 * return 0 otherwise.
 */
static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);

	if (!IPOIB_CM_SUPPORTED(n->ha))
		return 0;

	return test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) ? 1 : 0;
}
static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
{
return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags);
}
/* Return the TX connection stored in @neigh (NULL after ipoib_cm_set(neigh, NULL)). */
static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
{
return neigh->cm;
}
/* Store @tx as the TX connection of @neigh; @tx may be NULL. */
static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
{
neigh->cm = tx;
}
void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx);
int ipoib_cm_dev_open(struct net_device *dev);
void ipoib_cm_dev_stop(struct net_device *dev);
int ipoib_cm_dev_init(struct net_device *dev);
int ipoib_cm_add_mode_attr(struct net_device *dev);
void ipoib_cm_dev_cleanup(struct net_device *dev);
struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
struct ipoib_neigh *neigh);
void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
unsigned int mtu);
void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
#else
/*
 * Connected mode compiled out: no-op stand-ins so callers need no
 * #ifdefs.  Predicates report "disabled", ipoib_cm_get() and
 * ipoib_cm_create_tx() yield NULL, and ipoib_cm_dev_init() fails with
 * -ENOSYS.
 */
struct ipoib_cm_tx;

static inline int ipoib_cm_admin_enabled(struct net_device *dev)
{
	return 0;
}

static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
{
	return 0;
}

static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
{
	return 0;
}

static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
{
	return NULL;
}

static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
{
}

static inline void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb,
				 struct ipoib_cm_tx *tx)
{
}

static inline int ipoib_cm_dev_open(struct net_device *dev)
{
	return 0;
}

static inline void ipoib_cm_dev_stop(struct net_device *dev)
{
}

static inline int ipoib_cm_dev_init(struct net_device *dev)
{
	return -ENOSYS;
}

static inline void ipoib_cm_dev_cleanup(struct net_device *dev)
{
}

static inline struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev,
						     struct ipoib_path *path,
						     struct ipoib_neigh *neigh)
{
	return NULL;
}

static inline void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
{
}

static inline int ipoib_cm_add_mode_attr(struct net_device *dev)
{
	return 0;
}

/* Without connected mode the oversized skb is simply freed. */
static inline void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
					 unsigned int mtu)
{
	dev_kfree_skb_any(skb);
}

static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
}
#endif
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
void ipoib_create_debug_files(struct net_device *dev); void ipoib_create_debug_files(struct net_device *dev);
void ipoib_delete_debug_files(struct net_device *dev); void ipoib_delete_debug_files(struct net_device *dev);
...@@ -392,4 +605,6 @@ extern int ipoib_debug_level; ...@@ -392,4 +605,6 @@ extern int ipoib_debug_level;
#define IPOIB_GID_ARG(gid) IPOIB_GID_RAW_ARG((gid).raw) #define IPOIB_GID_ARG(gid) IPOIB_GID_RAW_ARG((gid).raw)
/*
 * Low 24 bits of the big-endian 32-bit word at the start of hardware
 * address @ha.  The argument is parenthesized so that the cast applies
 * to the whole expression: without it, IPOIB_QPN(p + n) would advance p
 * in __be32 units instead of the caller's units.
 */
#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) (ha)) & 0xffffff)
#endif /* _IPOIB_H */ #endif /* _IPOIB_H */
This diff is collapsed.
...@@ -50,8 +50,6 @@ MODULE_PARM_DESC(data_debug_level, ...@@ -50,8 +50,6 @@ MODULE_PARM_DESC(data_debug_level,
"Enable data path debug tracing if > 0"); "Enable data path debug tracing if > 0");
#endif #endif
#define IPOIB_OP_RECV (1ul << 31)
static DEFINE_MUTEX(pkey_mutex); static DEFINE_MUTEX(pkey_mutex);
struct ipoib_ah *ipoib_create_ah(struct net_device *dev, struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
...@@ -268,10 +266,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) ...@@ -268,10 +266,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
spin_lock_irqsave(&priv->tx_lock, flags); spin_lock_irqsave(&priv->tx_lock, flags);
++priv->tx_tail; ++priv->tx_tail;
if (netif_queue_stopped(dev) && if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) && priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
netif_wake_queue(dev); netif_wake_queue(dev);
}
spin_unlock_irqrestore(&priv->tx_lock, flags); spin_unlock_irqrestore(&priv->tx_lock, flags);
if (wc->status != IB_WC_SUCCESS && if (wc->status != IB_WC_SUCCESS &&
...@@ -283,7 +282,9 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc) ...@@ -283,7 +282,9 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc) static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc)
{ {
if (wc->wr_id & IPOIB_OP_RECV) if (wc->wr_id & IPOIB_CM_OP_SRQ)
ipoib_cm_handle_rx_wc(dev, wc);
else if (wc->wr_id & IPOIB_OP_RECV)
ipoib_ib_handle_rx_wc(dev, wc); ipoib_ib_handle_rx_wc(dev, wc);
else else
ipoib_ib_handle_tx_wc(dev, wc); ipoib_ib_handle_tx_wc(dev, wc);
...@@ -327,12 +328,12 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, ...@@ -327,12 +328,12 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
struct ipoib_tx_buf *tx_req; struct ipoib_tx_buf *tx_req;
u64 addr; u64 addr;
if (unlikely(skb->len > dev->mtu + INFINIBAND_ALEN)) { if (unlikely(skb->len > priv->mcast_mtu + INFINIBAND_ALEN)) {
ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
skb->len, dev->mtu + INFINIBAND_ALEN); skb->len, priv->mcast_mtu + INFINIBAND_ALEN);
++priv->stats.tx_dropped; ++priv->stats.tx_dropped;
++priv->stats.tx_errors; ++priv->stats.tx_errors;
dev_kfree_skb_any(skb); ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
return; return;
} }
...@@ -372,6 +373,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, ...@@ -372,6 +373,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) { if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n"); ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
netif_stop_queue(dev); netif_stop_queue(dev);
set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
} }
} }
} }
...@@ -424,6 +426,13 @@ int ipoib_ib_dev_open(struct net_device *dev) ...@@ -424,6 +426,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
return -1; return -1;
} }
ret = ipoib_cm_dev_open(dev);
if (ret) {
	/* Name the call that actually failed, not ipoib_ib_post_receives(). */
	ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
	ipoib_ib_dev_stop(dev);
	return -1;
}
clear_bit(IPOIB_STOP_REAPER, &priv->flags); clear_bit(IPOIB_STOP_REAPER, &priv->flags);
queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ); queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);
...@@ -509,6 +518,8 @@ int ipoib_ib_dev_stop(struct net_device *dev) ...@@ -509,6 +518,8 @@ int ipoib_ib_dev_stop(struct net_device *dev)
clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags); clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);
ipoib_cm_dev_stop(dev);
/* /*
* Move our QP to the error state and then reinitialize in * Move our QP to the error state and then reinitialize in
* when all work requests have completed or have been flushed. * when all work requests have completed or have been flushed.
......
...@@ -49,8 +49,6 @@ ...@@ -49,8 +49,6 @@
#include <net/dst.h> #include <net/dst.h>
#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
MODULE_AUTHOR("Roland Dreier"); MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL"); MODULE_LICENSE("Dual BSD/GPL");
...@@ -145,6 +143,8 @@ static int ipoib_stop(struct net_device *dev) ...@@ -145,6 +143,8 @@ static int ipoib_stop(struct net_device *dev)
netif_stop_queue(dev); netif_stop_queue(dev);
clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
/* /*
* Now flush workqueue to make sure a scheduled task doesn't * Now flush workqueue to make sure a scheduled task doesn't
* bring our internal state back up. * bring our internal state back up.
...@@ -178,8 +178,18 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) ...@@ -178,8 +178,18 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{ {
struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_dev_priv *priv = netdev_priv(dev);
if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) /* dev->mtu > 2K ==> connected mode */
if (ipoib_cm_admin_enabled(dev) && new_mtu <= IPOIB_CM_MTU) {
if (new_mtu > priv->mcast_mtu)
ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
priv->mcast_mtu);
dev->mtu = new_mtu;
return 0;
}
if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN) {
return -EINVAL; return -EINVAL;
}
priv->admin_mtu = new_mtu; priv->admin_mtu = new_mtu;
...@@ -414,6 +424,20 @@ static void path_rec_completion(int status, ...@@ -414,6 +424,20 @@ static void path_rec_completion(int status,
memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
sizeof(union ib_gid)); sizeof(union ib_gid));
if (ipoib_cm_enabled(dev, neigh->neighbour)) {
if (!ipoib_cm_get(neigh))
ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
path,
neigh));
if (!ipoib_cm_get(neigh)) {
list_del(&neigh->list);
if (neigh->ah)
ipoib_put_ah(neigh->ah);
ipoib_neigh_free(dev, neigh);
continue;
}
}
while ((skb = __skb_dequeue(&neigh->queue))) while ((skb = __skb_dequeue(&neigh->queue)))
__skb_queue_tail(&skqueue, skb); __skb_queue_tail(&skqueue, skb);
} }
...@@ -520,7 +544,25 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) ...@@ -520,7 +544,25 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw, memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
sizeof(union ib_gid)); sizeof(union ib_gid));
ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha)); if (ipoib_cm_enabled(dev, neigh->neighbour)) {
if (!ipoib_cm_get(neigh))
ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
if (!ipoib_cm_get(neigh)) {
list_del(&neigh->list);
if (neigh->ah)
ipoib_put_ah(neigh->ah);
ipoib_neigh_free(dev, neigh);
goto err_drop;
}
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
__skb_queue_tail(&neigh->queue, skb);
else {
ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
skb_queue_len(&neigh->queue));
goto err_drop;
}
} else
ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
} else { } else {
neigh->ah = NULL; neigh->ah = NULL;
...@@ -538,6 +580,7 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev) ...@@ -538,6 +580,7 @@ static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
err_path: err_path:
ipoib_neigh_free(dev, neigh); ipoib_neigh_free(dev, neigh);
err_drop:
++priv->stats.tx_dropped; ++priv->stats.tx_dropped;
dev_kfree_skb_any(skb); dev_kfree_skb_any(skb);
...@@ -640,7 +683,12 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -640,7 +683,12 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
neigh = *to_ipoib_neigh(skb->dst->neighbour); neigh = *to_ipoib_neigh(skb->dst->neighbour);
if (likely(neigh->ah)) { if (ipoib_cm_get(neigh)) {
if (ipoib_cm_up(neigh)) {
ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
goto out;
}
} else if (neigh->ah) {
if (unlikely(memcmp(&neigh->dgid.raw, if (unlikely(memcmp(&neigh->dgid.raw,
skb->dst->neighbour->ha + 4, skb->dst->neighbour->ha + 4,
sizeof(union ib_gid)))) { sizeof(union ib_gid)))) {
...@@ -805,6 +853,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour) ...@@ -805,6 +853,7 @@ struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour)
neigh->neighbour = neighbour; neigh->neighbour = neighbour;
*to_ipoib_neigh(neighbour) = neigh; *to_ipoib_neigh(neighbour) = neigh;
skb_queue_head_init(&neigh->queue); skb_queue_head_init(&neigh->queue);
ipoib_cm_set(neigh, NULL);
return neigh; return neigh;
} }
...@@ -818,6 +867,8 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh) ...@@ -818,6 +867,8 @@ void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
++priv->stats.tx_dropped; ++priv->stats.tx_dropped;
dev_kfree_skb_any(skb); dev_kfree_skb_any(skb);
} }
if (ipoib_cm_get(neigh))
ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
kfree(neigh); kfree(neigh);
} }
...@@ -1080,6 +1131,8 @@ static struct net_device *ipoib_add_port(const char *format, ...@@ -1080,6 +1131,8 @@ static struct net_device *ipoib_add_port(const char *format,
ipoib_create_debug_files(priv->dev); ipoib_create_debug_files(priv->dev);
if (ipoib_cm_add_mode_attr(priv->dev))
goto sysfs_failed;
if (ipoib_add_pkey_attr(priv->dev)) if (ipoib_add_pkey_attr(priv->dev))
goto sysfs_failed; goto sysfs_failed;
if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
......
...@@ -597,7 +597,9 @@ void ipoib_mcast_join_task(struct work_struct *work) ...@@ -597,7 +597,9 @@ void ipoib_mcast_join_task(struct work_struct *work)
priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) - priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
IPOIB_ENCAP_LEN; IPOIB_ENCAP_LEN;
dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
if (!ipoib_cm_admin_enabled(dev))
dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n"); ipoib_dbg_mcast(priv, "successfully joined all multicast groups\n");
......
...@@ -168,35 +168,41 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) ...@@ -168,35 +168,41 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
.qp_type = IB_QPT_UD .qp_type = IB_QPT_UD
}; };
int ret, size;
priv->pd = ib_alloc_pd(priv->ca); priv->pd = ib_alloc_pd(priv->ca);
if (IS_ERR(priv->pd)) { if (IS_ERR(priv->pd)) {
printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name); printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
return -ENODEV; return -ENODEV;
} }
priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
ipoib_sendq_size + ipoib_recvq_size + 1); if (IS_ERR(priv->mr)) {
printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
goto out_free_pd;
}
size = ipoib_sendq_size + ipoib_recvq_size + 1;
ret = ipoib_cm_dev_init(dev);
if (!ret)
size += ipoib_recvq_size;
priv->cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size);
if (IS_ERR(priv->cq)) { if (IS_ERR(priv->cq)) {
printk(KERN_WARNING "%s: failed to create CQ\n", ca->name); printk(KERN_WARNING "%s: failed to create CQ\n", ca->name);
goto out_free_pd; goto out_free_mr;
} }
if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP)) if (ib_req_notify_cq(priv->cq, IB_CQ_NEXT_COMP))
goto out_free_cq; goto out_free_cq;
priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(priv->mr)) {
printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
goto out_free_cq;
}
init_attr.send_cq = priv->cq; init_attr.send_cq = priv->cq;
init_attr.recv_cq = priv->cq, init_attr.recv_cq = priv->cq,
priv->qp = ib_create_qp(priv->pd, &init_attr); priv->qp = ib_create_qp(priv->pd, &init_attr);
if (IS_ERR(priv->qp)) { if (IS_ERR(priv->qp)) {
printk(KERN_WARNING "%s: failed to create QP\n", ca->name); printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
goto out_free_mr; goto out_free_cq;
} }
priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff; priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
...@@ -212,12 +218,12 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) ...@@ -212,12 +218,12 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
return 0; return 0;
out_free_mr:
ib_dereg_mr(priv->mr);
out_free_cq: out_free_cq:
ib_destroy_cq(priv->cq); ib_destroy_cq(priv->cq);
out_free_mr:
ib_dereg_mr(priv->mr);
out_free_pd: out_free_pd:
ib_dealloc_pd(priv->pd); ib_dealloc_pd(priv->pd);
return -ENODEV; return -ENODEV;
...@@ -235,12 +241,14 @@ void ipoib_transport_dev_cleanup(struct net_device *dev) ...@@ -235,12 +241,14 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)
clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
} }
if (ib_dereg_mr(priv->mr))
ipoib_warn(priv, "ib_dereg_mr failed\n");
if (ib_destroy_cq(priv->cq)) if (ib_destroy_cq(priv->cq))
ipoib_warn(priv, "ib_cq_destroy failed\n"); ipoib_warn(priv, "ib_cq_destroy failed\n");
ipoib_cm_dev_cleanup(dev);
if (ib_dereg_mr(priv->mr))
ipoib_warn(priv, "ib_dereg_mr failed\n");
if (ib_dealloc_pd(priv->pd)) if (ib_dealloc_pd(priv->pd))
ipoib_warn(priv, "ib_dealloc_pd failed\n"); ipoib_warn(priv, "ib_dealloc_pd failed\n");
} }
......
...@@ -115,6 +115,8 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ...@@ -115,6 +115,8 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
ipoib_create_debug_files(priv->dev); ipoib_create_debug_files(priv->dev);
if (ipoib_cm_add_mode_attr(priv->dev))
goto sysfs_failed;
if (ipoib_add_pkey_attr(priv->dev)) if (ipoib_add_pkey_attr(priv->dev))
goto sysfs_failed; goto sysfs_failed;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment