Merge tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband

Pull infiniband reverts from Roland Dreier: "Last minute InfiniBand/RDMA changes for 3.19: - Revert IPoIB driver back to 3.18 state. We had a number of fixes go into 3.19, but they introduced regressions. We tried to get everything fixed up but ran out of time, so we'll try again for 3.20. - Similarly, turn off the new "extended query port" verb. Late in the cycle we realized the ABI is not quite right, and rather than freeze something in a rush and make a mistake, we'll take a bit more time and get it right in 3.20" * tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband: IB/core: Temporarily disable ex_query_device uverb Revert "IPoIB: Consolidate rtnl_lock tasks in workqueue" Revert "IPoIB: Make the carrier_on_task race aware" Revert "IPoIB: fix MCAST_FLAG_BUSY usage" Revert "IPoIB: fix mcast_dev_flush/mcast_restart_task race" Revert "IPoIB: change init sequence ordering" Revert "IPoIB: Use dedicated workqueues per interface" Revert "IPoIB: Make ipoib_mcast_stop_thread flush the workqueue" Revert "IPoIB: No longer use flush as a parameter"

Merge tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband
Pull infiniband reverts from Roland Dreier: "Last minute InfiniBand/RDMA changes for 3.19: - Revert IPoIB driver back to 3.18 state. We had a number of fixes go into 3.19, but they introduced regressions. We tried to get everything fixed up but ran out of time, so we'll try again for 3.20. - Similarly, turn off the new "extended query port" verb. Late in the cycle we realized the ABI is not quite right, and rather than freeze something in a rush and make a mistake, we'll take a bit more time and get it right in 3.20" * tag 'rdma-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband: IB/core: Temporarily disable ex_query_device uverb Revert "IPoIB: Consolidate rtnl_lock tasks in workqueue" Revert "IPoIB: Make the carrier_on_task race aware" Revert "IPoIB: fix MCAST_FLAG_BUSY usage" Revert "IPoIB: fix mcast_dev_flush/mcast_restart_task race" Revert "IPoIB: change init sequence ordering" Revert "IPoIB: Use dedicated workqueues per interface" Revert "IPoIB: Make ipoib_mcast_stop_thread flush the workqueue" Revert "IPoIB: No longer use flush as a parameter"
dc6d6844 · Linus Torvalds · 59acf657 · ecb7b123 · dc6d6844 · dc6d6844
Commit dc6d6844 authored Feb 03, 2015 by Linus Torvalds
7 changed files
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -123,7 +123,6 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
 				    struct ib_udata *uhw) = {
 	[IB_USER_VERBS_EX_CMD_CREATE_FLOW]	= ib_uverbs_ex_create_flow,
 	[IB_USER_VERBS_EX_CMD_DESTROY_FLOW]	= ib_uverbs_ex_destroy_flow,
-	[IB_USER_VERBS_EX_CMD_QUERY_DEVICE]	= ib_uverbs_ex_query_device
 };

 static void ib_uverbs_add_one(struct ib_device *device);

--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -98,15 +98,9 @@ enum {

 	IPOIB_MCAST_FLAG_FOUND	  = 0,	/* used in set_multicast_list */
 	IPOIB_MCAST_FLAG_SENDONLY = 1,
-	/*
-	 * For IPOIB_MCAST_FLAG_BUSY
-	 * When set, in flight join and mcast->mc is unreliable
-	 * When clear and mcast->mc IS_ERR_OR_NULL, need to restart or
-	 *   haven't started yet
-	 * When clear and mcast->mc is valid pointer, join was successful
-	 */
-	IPOIB_MCAST_FLAG_BUSY	  = 2,
+	IPOIB_MCAST_FLAG_BUSY	  = 2,	/* joining or already joined */
 	IPOIB_MCAST_FLAG_ATTACHED = 3,
+	IPOIB_MCAST_JOIN_STARTED  = 4,

 	MAX_SEND_CQE		  = 16,
 	IPOIB_CM_COPYBREAK	  = 256,
@@ -323,7 +317,6 @@ struct ipoib_dev_priv {
 	struct list_head multicast_list;
 	struct rb_root multicast_tree;

-	struct workqueue_struct *wq;
 	struct delayed_work mcast_task;
 	struct work_struct carrier_on_task;
 	struct work_struct flush_light;
@@ -484,10 +477,10 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work);
 void ipoib_pkey_event(struct work_struct *work);
 void ipoib_ib_dev_cleanup(struct net_device *dev);

-int ipoib_ib_dev_open(struct net_device *dev);
+int ipoib_ib_dev_open(struct net_device *dev, int flush);
 int ipoib_ib_dev_up(struct net_device *dev);
-int ipoib_ib_dev_down(struct net_device *dev);
-int ipoib_ib_dev_stop(struct net_device *dev);
+int ipoib_ib_dev_down(struct net_device *dev, int flush);
+int ipoib_ib_dev_stop(struct net_device *dev, int flush);
 void ipoib_pkey_dev_check_presence(struct net_device *dev);

 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
@@ -499,7 +492,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);

 void ipoib_mcast_restart_task(struct work_struct *work);
 int ipoib_mcast_start_thread(struct net_device *dev);
-int ipoib_mcast_stop_thread(struct net_device *dev);
+int ipoib_mcast_stop_thread(struct net_device *dev, int flush);

 void ipoib_mcast_dev_down(struct net_device *dev);
 void ipoib_mcast_dev_flush(struct net_device *dev);

--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -474,7 +474,7 @@ static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 	}

 	spin_lock_irq(&priv->lock);
-	queue_delayed_work(priv->wq,
+	queue_delayed_work(ipoib_workqueue,
 			   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
 	/* Add this entry to passive ids list head, but do not re-add it
 	 * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
@@ -576,7 +576,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 			spin_lock_irqsave(&priv->lock, flags);
 			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
 			ipoib_cm_start_rx_drain(priv);
-			queue_work(priv->wq, &priv->cm.rx_reap_task);
+			queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
 			spin_unlock_irqrestore(&priv->lock, flags);
 		} else
 			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
@@ -603,7 +603,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 				spin_lock_irqsave(&priv->lock, flags);
 				list_move(&p->list, &priv->cm.rx_reap_list);
 				spin_unlock_irqrestore(&priv->lock, flags);
-				queue_work(priv->wq, &priv->cm.rx_reap_task);
+				queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
 			}
 			return;
 		}
@@ -827,7 +827,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)

 		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
 			list_move(&tx->list, &priv->cm.reap_list);
-			queue_work(priv->wq, &priv->cm.reap_task);
+			queue_work(ipoib_workqueue, &priv->cm.reap_task);
 		}

 		clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
@@ -1255,7 +1255,7 @@ static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,

 		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
 			list_move(&tx->list, &priv->cm.reap_list);
-			queue_work(priv->wq, &priv->cm.reap_task);
+			queue_work(ipoib_workqueue, &priv->cm.reap_task);
 		}

 		spin_unlock_irqrestore(&priv->lock, flags);
@@ -1284,7 +1284,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path
 	tx->dev = dev;
 	list_add(&tx->list, &priv->cm.start_list);
 	set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
-	queue_work(priv->wq, &priv->cm.start_task);
+	queue_work(ipoib_workqueue, &priv->cm.start_task);
 	return tx;
 }

@@ -1295,7 +1295,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
 	if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
 		spin_lock_irqsave(&priv->lock, flags);
 		list_move(&tx->list, &priv->cm.reap_list);
-		queue_work(priv->wq, &priv->cm.reap_task);
+		queue_work(ipoib_workqueue, &priv->cm.reap_task);
 		ipoib_dbg(priv, "Reap connection for gid %pI6\n",
 			  tx->neigh->daddr + 4);
 		tx->neigh = NULL;
@@ -1417,7 +1417,7 @@ void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,

 	skb_queue_tail(&priv->cm.skb_queue, skb);
 	if (e)
-		queue_work(priv->wq, &priv->cm.skb_task);
+		queue_work(ipoib_workqueue, &priv->cm.skb_task);
 }

 static void ipoib_cm_rx_reap(struct work_struct *work)
@@ -1450,7 +1450,7 @@ static void ipoib_cm_stale_task(struct work_struct *work)
 	}

 	if (!list_empty(&priv->cm.passive_ids))
-		queue_delayed_work(priv->wq,
+		queue_delayed_work(ipoib_workqueue,
 				   &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
 	spin_unlock_irq(&priv->lock);
 }

--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -655,7 +655,7 @@ void ipoib_reap_ah(struct work_struct *work)
 	__ipoib_reap_ah(dev);

 	if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
-		queue_delayed_work(priv->wq, &priv->ah_reap_task,
+		queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
 				   round_jiffies_relative(HZ));
 }

@@ -664,7 +664,7 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx)
 	drain_tx_cq((struct net_device *)ctx);
 }

-int ipoib_ib_dev_open(struct net_device *dev)
+int ipoib_ib_dev_open(struct net_device *dev, int flush)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	int ret;
@@ -696,7 +696,7 @@ int ipoib_ib_dev_open(struct net_device *dev)
 	}

 	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
-	queue_delayed_work(priv->wq, &priv->ah_reap_task,
+	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task,
 			   round_jiffies_relative(HZ));

 	if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
@@ -706,7 +706,7 @@ int ipoib_ib_dev_open(struct net_device *dev)
 dev_stop:
 	if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
 		napi_enable(&priv->napi);
-	ipoib_ib_dev_stop(dev);
+	ipoib_ib_dev_stop(dev, flush);
 	return -1;
 }

@@ -738,7 +738,7 @@ int ipoib_ib_dev_up(struct net_device *dev)
 	return ipoib_mcast_start_thread(dev);
 }

-int ipoib_ib_dev_down(struct net_device *dev)
+int ipoib_ib_dev_down(struct net_device *dev, int flush)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);

@@ -747,7 +747,7 @@ int ipoib_ib_dev_down(struct net_device *dev)
 	clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
 	netif_carrier_off(dev);

-	ipoib_mcast_stop_thread(dev);
+	ipoib_mcast_stop_thread(dev, flush);
 	ipoib_mcast_dev_flush(dev);

 	ipoib_flush_paths(dev);
@@ -807,7 +807,7 @@ void ipoib_drain_cq(struct net_device *dev)
 	local_bh_enable();
 }

-int ipoib_ib_dev_stop(struct net_device *dev)
+int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ib_qp_attr qp_attr;
@@ -880,7 +880,8 @@ int ipoib_ib_dev_stop(struct net_device *dev)
 	/* Wait for all AHs to be reaped */
 	set_bit(IPOIB_STOP_REAPER, &priv->flags);
 	cancel_delayed_work(&priv->ah_reap_task);
-	flush_workqueue(priv->wq);
+	if (flush)
+		flush_workqueue(ipoib_workqueue);

 	begin = jiffies;

@@ -917,7 +918,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 		    (unsigned long) dev);

 	if (dev->flags & IFF_UP) {
-		if (ipoib_ib_dev_open(dev)) {
+		if (ipoib_ib_dev_open(dev, 1)) {
 			ipoib_transport_dev_cleanup(dev);
 			return -ENODEV;
 		}
@@ -1039,12 +1040,12 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
 	}

 	if (level >= IPOIB_FLUSH_NORMAL)
-		ipoib_ib_dev_down(dev);
+		ipoib_ib_dev_down(dev, 0);

 	if (level == IPOIB_FLUSH_HEAVY) {
 		if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
-			ipoib_ib_dev_stop(dev);
-		if (ipoib_ib_dev_open(dev) != 0)
+			ipoib_ib_dev_stop(dev, 0);
+		if (ipoib_ib_dev_open(dev, 0) != 0)
 			return;
 		if (netif_queue_stopped(dev))
 			netif_start_queue(dev);
@@ -1096,7 +1097,7 @@ void ipoib_ib_dev_cleanup(struct net_device *dev)
 	 */
 	ipoib_flush_paths(dev);

-	ipoib_mcast_stop_thread(dev);
+	ipoib_mcast_stop_thread(dev, 1);
 	ipoib_mcast_dev_flush(dev);

 	ipoib_transport_dev_cleanup(dev);

--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -108,7 +108,7 @@ int ipoib_open(struct net_device *dev)

 	set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

-	if (ipoib_ib_dev_open(dev)) {
+	if (ipoib_ib_dev_open(dev, 1)) {
 		if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
 			return 0;
 		goto err_disable;
@@ -139,7 +139,7 @@ int ipoib_open(struct net_device *dev)
 	return 0;

 err_stop:
-	ipoib_ib_dev_stop(dev);
+	ipoib_ib_dev_stop(dev, 1);

 err_disable:
 	clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
@@ -157,8 +157,8 @@ static int ipoib_stop(struct net_device *dev)

 	netif_stop_queue(dev);

-	ipoib_ib_dev_down(dev);
-	ipoib_ib_dev_stop(dev);
+	ipoib_ib_dev_down(dev, 1);
+	ipoib_ib_dev_stop(dev, 0);

 	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
 		struct ipoib_dev_priv *cpriv;
@@ -839,7 +839,7 @@ static void ipoib_set_mcast_list(struct net_device *dev)
 		return;
 	}

-	queue_work(priv->wq, &priv->restart_task);
+	queue_work(ipoib_workqueue, &priv->restart_task);
 }

 static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
@@ -954,7 +954,7 @@ static void ipoib_reap_neigh(struct work_struct *work)
 	__ipoib_reap_neigh(priv);

 	if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
-		queue_delayed_work(priv->wq, &priv->neigh_reap_task,
+		queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
 				   arp_tbl.gc_interval);
 }

@@ -1133,7 +1133,7 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)

 	/* start garbage collection */
 	clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
-	queue_delayed_work(priv->wq, &priv->neigh_reap_task,
+	queue_delayed_work(ipoib_workqueue, &priv->neigh_reap_task,
 			   arp_tbl.gc_interval);

 	return 0;
@@ -1262,13 +1262,15 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);

+	if (ipoib_neigh_hash_init(priv) < 0)
+		goto out;
 	/* Allocate RX/TX "rings" to hold queued skbs */
 	priv->rx_ring =	kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
 				GFP_KERNEL);
 	if (!priv->rx_ring) {
 		printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
 		       ca->name, ipoib_recvq_size);
-		goto out;
+		goto out_neigh_hash_cleanup;
 	}

 	priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
@@ -1283,24 +1285,16 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
 	if (ipoib_ib_dev_init(dev, ca, port))
 		goto out_tx_ring_cleanup;

-	/*
-	 * Must be after ipoib_ib_dev_init so we can allocate a per
-	 * device wq there and use it here
-	 */
-	if (ipoib_neigh_hash_init(priv) < 0)
-		goto out_dev_uninit;
-
 	return 0;

-out_dev_uninit:
-	ipoib_ib_dev_cleanup(dev);
-
 out_tx_ring_cleanup:
 	vfree(priv->tx_ring);

 out_rx_ring_cleanup:
 	kfree(priv->rx_ring);

+out_neigh_hash_cleanup:
+	ipoib_neigh_hash_uninit(dev);
 out:
 	return -ENOMEM;
 }
@@ -1323,12 +1317,6 @@ void ipoib_dev_cleanup(struct net_device *dev)
 	}
 	unregister_netdevice_many(&head);

-	/*
-	 * Must be before ipoib_ib_dev_cleanup or we delete an in use
-	 * work queue
-	 */
-	ipoib_neigh_hash_uninit(dev);
-
 	ipoib_ib_dev_cleanup(dev);

 	kfree(priv->rx_ring);
@@ -1336,6 +1324,8 @@ void ipoib_dev_cleanup(struct net_device *dev)

 	priv->rx_ring = NULL;
 	priv->tx_ring = NULL;
+
+	ipoib_neigh_hash_uninit(dev);
 }

 static const struct header_ops ipoib_header_ops = {
@@ -1646,7 +1636,7 @@ static struct net_device *ipoib_add_port(const char *format,
 	/* Stop GC if started before flush */
 	set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
 	cancel_delayed_work(&priv->neigh_reap_task);
-	flush_workqueue(priv->wq);
+	flush_workqueue(ipoib_workqueue);

 event_failed:
 	ipoib_dev_cleanup(priv->dev);
@@ -1717,7 +1707,7 @@ static void ipoib_remove_one(struct ib_device *device)
 		/* Stop GC */
 		set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
 		cancel_delayed_work(&priv->neigh_reap_task);
-		flush_workqueue(priv->wq);
+		flush_workqueue(ipoib_workqueue);

 		unregister_netdev(priv->dev);
 		free_netdev(priv->dev);
@@ -1758,13 +1748,8 @@ static int __init ipoib_init_module(void)
 	 * unregister_netdev() and linkwatch_event take the rtnl lock,
 	 * so flush_scheduled_work() can deadlock during device
 	 * removal.
-	 *
-	 * In addition, bringing one device up and another down at the
-	 * same time can deadlock a single workqueue, so we have this
-	 * global fallback workqueue, but we also attempt to open a
-	 * per device workqueue each time we bring an interface up
 	 */
-	ipoib_workqueue = create_singlethread_workqueue("ipoib_flush");
+	ipoib_workqueue = create_singlethread_workqueue("ipoib");
 	if (!ipoib_workqueue) {
 		ret = -ENOMEM;
 		goto err_fs;

--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -190,6 +190,12 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 		spin_unlock_irq(&priv->lock);
 		priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
 		set_qkey = 1;
+
+		if (!ipoib_cm_admin_enabled(dev)) {
+			rtnl_lock();
+			dev_set_mtu(dev, min(priv->mcast_mtu, priv->admin_mtu));
+			rtnl_unlock();
+		}
 	}

 	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
@@ -271,27 +277,16 @@ ipoib_mcast_sendonly_join_complete(int status,
 	struct ipoib_mcast *mcast = multicast->context;
 	struct net_device *dev = mcast->dev;

-	/*
-	 * We have to take the mutex to force mcast_sendonly_join to
-	 * return from ib_sa_multicast_join and set mcast->mc to a
-	 * valid value.  Otherwise we were racing with ourselves in
-	 * that we might fail here, but get a valid return from
-	 * ib_sa_multicast_join after we had cleared mcast->mc here,
-	 * resulting in mis-matched joins and leaves and a deadlock
-	 */
-	mutex_lock(&mcast_mutex);
-
 	/* We trap for port events ourselves. */
 	if (status == -ENETRESET)
-		goto out;
+		return 0;

 	if (!status)
 		status = ipoib_mcast_join_finish(mcast, &multicast->rec);

 	if (status) {
 		if (mcast->logcount++ < 20)
-			ipoib_dbg_mcast(netdev_priv(dev), "sendonly multicast "
-					"join failed for %pI6, status %d\n",
+			ipoib_dbg_mcast(netdev_priv(dev), "multicast join failed for %pI6, status %d\n",
 					mcast->mcmember.mgid.raw, status);

 		/* Flush out any queued packets */
@@ -301,15 +296,11 @@ ipoib_mcast_sendonly_join_complete(int status,
 			dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
 		}
 		netif_tx_unlock_bh(dev);
+
+		/* Clear the busy flag so we try again */
+		status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY,
+					    &mcast->flags);
 	}
-out:
-	clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-	if (status)
-		mcast->mc = NULL;
-	complete(&mcast->done);
-	if (status == -ENETRESET)
-		status = 0;
-	mutex_unlock(&mcast_mutex);
 	return status;
 }

@@ -327,14 +318,12 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
 	int ret = 0;

 	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
-		ipoib_dbg_mcast(priv, "device shutting down, no sendonly "
-				"multicast joins\n");
+		ipoib_dbg_mcast(priv, "device shutting down, no multicast joins\n");
 		return -ENODEV;
 	}

-	if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
-		ipoib_dbg_mcast(priv, "multicast entry busy, skipping "
-				"sendonly join\n");
+	if (test_and_set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
+		ipoib_dbg_mcast(priv, "multicast entry busy, skipping\n");
 		return -EBUSY;
 	}

@@ -342,9 +331,6 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
 	rec.port_gid = priv->local_gid;
 	rec.pkey     = cpu_to_be16(priv->pkey);

-	mutex_lock(&mcast_mutex);
-	init_completion(&mcast->done);
-	set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
 	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca,
 					 priv->port, &rec,
 					 IB_SA_MCMEMBER_REC_MGID	|
@@ -357,14 +343,12 @@ static int ipoib_mcast_sendonly_join(struct ipoib_mcast *mcast)
 	if (IS_ERR(mcast->mc)) {
 		ret = PTR_ERR(mcast->mc);
 		clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-		complete(&mcast->done);
-		ipoib_warn(priv, "ib_sa_join_multicast for sendonly join "
-			   "failed (ret = %d)\n", ret);
+		ipoib_warn(priv, "ib_sa_join_multicast failed (ret = %d)\n",
+			   ret);
 	} else {
-		ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting "
-				"sendonly join\n", mcast->mcmember.mgid.raw);
+		ipoib_dbg_mcast(priv, "no multicast record for %pI6, starting join\n",
+				mcast->mcmember.mgid.raw);
 	}
-	mutex_unlock(&mcast_mutex);

 	return ret;
 }
@@ -375,29 +359,18 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work)
 						   carrier_on_task);
 	struct ib_port_attr attr;

-	if (ib_query_port(priv->ca, priv->port, &attr) ||
-	    attr.state != IB_PORT_ACTIVE) {
-		ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
-		return;
-	}
-
 	/*
 	 * Take rtnl_lock to avoid racing with ipoib_stop() and
 	 * turning the carrier back on while a device is being
-	 * removed.  However, ipoib_stop() will attempt to flush
-	 * the workqueue while holding the rtnl lock, so loop
-	 * on trylock until either we get the lock or we see
-	 * FLAG_ADMIN_UP go away as that signals that we are bailing
-	 * and can safely ignore the carrier on work.
+	 * removed.
 	 */
-	while (!rtnl_trylock()) {
-		if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+	if (ib_query_port(priv->ca, priv->port, &attr) ||
+	    attr.state != IB_PORT_ACTIVE) {
+		ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
 		return;
-		else
-			msleep(20);
 	}
-	if (!ipoib_cm_admin_enabled(priv->dev))
-		dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu));
+
+	rtnl_lock();
 	netif_carrier_on(priv->dev);
 	rtnl_unlock();
 }
@@ -412,35 +385,34 @@ static int ipoib_mcast_join_complete(int status,
 	ipoib_dbg_mcast(priv, "join completion for %pI6 (status %d)\n",
 			mcast->mcmember.mgid.raw, status);

-	/*
-	 * We have to take the mutex to force mcast_join to
-	 * return from ib_sa_multicast_join and set mcast->mc to a
-	 * valid value.  Otherwise we were racing with ourselves in
-	 * that we might fail here, but get a valid return from
-	 * ib_sa_multicast_join after we had cleared mcast->mc here,
-	 * resulting in mis-matched joins and leaves and a deadlock
-	 */
-	mutex_lock(&mcast_mutex);
-
 	/* We trap for port events ourselves. */
-	if (status == -ENETRESET)
+	if (status == -ENETRESET) {
+		status = 0;
 		goto out;
+	}

 	if (!status)
 		status = ipoib_mcast_join_finish(mcast, &multicast->rec);

 	if (!status) {
 		mcast->backoff = 1;
+		mutex_lock(&mcast_mutex);
 		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-			queue_delayed_work(priv->wq, &priv->mcast_task, 0);
+			queue_delayed_work(ipoib_workqueue,
+					   &priv->mcast_task, 0);
+		mutex_unlock(&mcast_mutex);

 		/*
-		 * Defer carrier on work to priv->wq to avoid a
+		 * Defer carrier on work to ipoib_workqueue to avoid a
 		 * deadlock on rtnl_lock here.
 		 */
 		if (mcast == priv->broadcast)
-			queue_work(priv->wq, &priv->carrier_on_task);
-	} else {
+			queue_work(ipoib_workqueue, &priv->carrier_on_task);
+
+		status = 0;
+		goto out;
+	}
+
 	if (mcast->logcount++ < 20) {
 		if (status == -ETIMEDOUT || status == -EAGAIN) {
 			ipoib_dbg_mcast(priv, "multicast join failed for %pI6, status %d\n",
@@ -454,21 +426,19 @@ static int ipoib_mcast_join_complete(int status,
 	mcast->backoff *= 2;
 	if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
 		mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
-	}
-out:
+
+	/* Clear the busy flag so we try again */
+	status = test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+
+	mutex_lock(&mcast_mutex);
 	spin_lock_irq(&priv->lock);
-	clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
-	if (status)
-		mcast->mc = NULL;
-	complete(&mcast->done);
-	if (status == -ENETRESET)
-		status = 0;
-	if (status && test_bit(IPOIB_MCAST_RUN, &priv->flags))
-		queue_delayed_work(priv->wq, &priv->mcast_task,
+	if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_task,
 				   mcast->backoff * HZ);
 	spin_unlock_irq(&priv->lock);
 	mutex_unlock(&mcast_mutex);
-
+out:
+	complete(&mcast->done);
 	return status;
 }

@@ -517,9 +487,10 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
 		rec.hop_limit	  = priv->broadcast->mcmember.hop_limit;
 	}

-	mutex_lock(&mcast_mutex);
-	init_completion(&mcast->done);
 	set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+	init_completion(&mcast->done);
+	set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags);
+
 	mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
 					 &rec, comp_mask, GFP_KERNEL,
 					 ipoib_mcast_join_complete, mcast);
@@ -533,11 +504,13 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
 		if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
 			mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;

+		mutex_lock(&mcast_mutex);
 		if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-			queue_delayed_work(priv->wq, &priv->mcast_task,
+			queue_delayed_work(ipoib_workqueue,
+					   &priv->mcast_task,
 					   mcast->backoff * HZ);
-	}
 		mutex_unlock(&mcast_mutex);
+	}
 }

 void ipoib_mcast_join_task(struct work_struct *work)
@@ -574,8 +547,8 @@ void ipoib_mcast_join_task(struct work_struct *work)
 			ipoib_warn(priv, "failed to allocate broadcast group\n");
 			mutex_lock(&mcast_mutex);
 			if (test_bit(IPOIB_MCAST_RUN, &priv->flags))
-				queue_delayed_work(priv->wq, &priv->mcast_task,
-						   HZ);
+				queue_delayed_work(ipoib_workqueue,
+						   &priv->mcast_task, HZ);
 			mutex_unlock(&mcast_mutex);
 			return;
 		}
@@ -590,8 +563,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
 	}

 	if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
-		if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
-		    !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
+		if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags))
 			ipoib_mcast_join(dev, priv->broadcast, 0);
 		return;
 	}
@@ -599,32 +571,22 @@ void ipoib_mcast_join_task(struct work_struct *work)
 	while (1) {
 		struct ipoib_mcast *mcast = NULL;

-		/*
-		 * Need the mutex so our flags are consistent, need the
-		 * priv->lock so we don't race with list removals in either
-		 * mcast_dev_flush or mcast_restart_task
-		 */
-		mutex_lock(&mcast_mutex);
 		spin_lock_irq(&priv->lock);
 		list_for_each_entry(mcast, &priv->multicast_list, list) {
-			if (IS_ERR_OR_NULL(mcast->mc) &&
-			    !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
-			    !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+			if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)
+			    && !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)
+			    && !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
 				/* Found the next unjoined group */
 				break;
 			}
 		}
 		spin_unlock_irq(&priv->lock);
-		mutex_unlock(&mcast_mutex);

 		if (&mcast->list == &priv->multicast_list) {
 			/* All done */
 			break;
 		}

-		if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
-			ipoib_mcast_sendonly_join(mcast);
-		else
 		ipoib_mcast_join(dev, mcast, 1);
 		return;
 	}
@@ -642,13 +604,13 @@ int ipoib_mcast_start_thread(struct net_device *dev)

 	mutex_lock(&mcast_mutex);
 	if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
-		queue_delayed_work(priv->wq, &priv->mcast_task, 0);
+		queue_delayed_work(ipoib_workqueue, &priv->mcast_task, 0);
 	mutex_unlock(&mcast_mutex);

 	return 0;
 }

-int ipoib_mcast_stop_thread(struct net_device *dev)
+int ipoib_mcast_stop_thread(struct net_device *dev, int flush)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);

@@ -659,7 +621,8 @@ int ipoib_mcast_stop_thread(struct net_device *dev)
 	cancel_delayed_work(&priv->mcast_task);
 	mutex_unlock(&mcast_mutex);

-	flush_workqueue(priv->wq);
+	if (flush)
+		flush_workqueue(ipoib_workqueue);

 	return 0;
 }
@@ -670,9 +633,6 @@ static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
 	int ret = 0;

 	if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
-		ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n");
-
-	if (!IS_ERR_OR_NULL(mcast->mc))
 		ib_sa_free_multicast(mcast->mc);

 	if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
@@ -725,8 +685,6 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
 		memcpy(mcast->mcmember.mgid.raw, mgid, sizeof (union ib_gid));
 		__ipoib_mcast_add(dev, mcast);
 		list_add_tail(&mcast->list, &priv->multicast_list);
-		if (!test_and_set_bit(IPOIB_MCAST_RUN, &priv->flags))
-			queue_delayed_work(priv->wq, &priv->mcast_task, 0);
 	}

 	if (!mcast->ah) {
@@ -740,6 +698,8 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
 		if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
 			ipoib_dbg_mcast(priv, "no address vector, "
 					"but multicast join already started\n");
+		else if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+			ipoib_mcast_sendonly_join(mcast);

 		/*
 		 * If lookup completes between here and out:, don't
@@ -799,12 +759,9 @@ void ipoib_mcast_dev_flush(struct net_device *dev)

 	spin_unlock_irqrestore(&priv->lock, flags);

-	/*
-	 * make sure the in-flight joins have finished before we attempt
-	 * to leave
-	 */
+	/* seperate between the wait to the leave*/
 	list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
-		if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+		if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags))
 			wait_for_completion(&mcast->done);

 	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
@@ -837,6 +794,8 @@ void ipoib_mcast_restart_task(struct work_struct *work)

 	ipoib_dbg_mcast(priv, "restarting multicast task\n");

+	ipoib_mcast_stop_thread(dev, 0);
+
 	local_irq_save(flags);
 	netif_addr_lock(dev);
 	spin_lock(&priv->lock);
@@ -921,38 +880,14 @@ void ipoib_mcast_restart_task(struct work_struct *work)
 	netif_addr_unlock(dev);
 	local_irq_restore(flags);

-	/*
-	 * make sure the in-flight joins have finished before we attempt
-	 * to leave
-	 */
-	list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
-		if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
-			wait_for_completion(&mcast->done);
-
-	/*
-	 * We have to cancel outside of the spinlock, but we have to
-	 * take the rtnl lock or else we race with the removal of
-	 * entries from the remove list in mcast_dev_flush as part
-	 * of ipoib_stop().  We detect the drop of the ADMIN_UP flag
-	 * to signal that we have hit this particular race, and we
-	 * return since we know we don't need to do anything else
-	 * anyway.
-	 */
-	while (!rtnl_trylock()) {
-		if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
-			return;
-		else
-			msleep(20);
-	}
+	/* We have to cancel outside of the spinlock */
 	list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
 		ipoib_mcast_leave(mcast->dev, mcast);
 		ipoib_mcast_free(mcast);
 	}
-	/*
-	 * Restart our join task if needed
-	 */
+
+	if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
 		ipoib_mcast_start_thread(dev);
-	rtnl_unlock();
 }

 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -145,20 +145,10 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	int ret, size;
 	int i;

-	/*
-	 * the various IPoIB tasks assume they will never race against
-	 * themselves, so always use a single thread workqueue
-	 */
-	priv->wq = create_singlethread_workqueue("ipoib_wq");
-	if (!priv->wq) {
-		printk(KERN_WARNING "ipoib: failed to allocate device WQ\n");
-		return -ENODEV;
-	}
-
 	priv->pd = ib_alloc_pd(priv->ca);
 	if (IS_ERR(priv->pd)) {
 		printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
-		goto out_free_wq;
+		return -ENODEV;
 	}

 	priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
@@ -252,10 +242,6 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)

 out_free_pd:
 	ib_dealloc_pd(priv->pd);
-
-out_free_wq:
-	destroy_workqueue(priv->wq);
-	priv->wq = NULL;
 	return -ENODEV;
 }

@@ -284,12 +270,6 @@ void ipoib_transport_dev_cleanup(struct net_device *dev)

 	if (ib_dealloc_pd(priv->pd))
 		ipoib_warn(priv, "ib_dealloc_pd failed\n");
-
-	if (priv->wq) {
-		flush_workqueue(priv->wq);
-		destroy_workqueue(priv->wq);
-		priv->wq = NULL;
-	}
 }

 void ipoib_event(struct ib_event_handler *handler,