[SCSI] fc_transport: make all rports wait dev_loss_tmo before removing them

Per the comment in the change - it's not always prudent to immediately remove the rport upon first notice of a disconnect. Make all rports wait dev_loss_tmo before being deleted (and each could have a separate dev_loss_tmo value). The original post was: http://marc.info/?l=linux-scsi&m=117392196006703&w=2 The repost contains the following changes: - Bug fix in fc_starget_delete(). Dev_loss_tmo_callbk() was called prior to tearing down the target. The callback is to be the last thing called, as it tells the LLDD that the rport is completely finished and can be torn down. Rework so that terminate_rport_io() is called to terminate the outstanding io. Isolated work so it's is simply "starget" work. - Fix holes in original patch. There were code paths that did not expect the dev_loss_tmo timer to be running for the non-fcp rports. - Bug Fix: the transport wasn't protecting against a LLDD calling fc_remote_port_delete() back-to-back. Thus, the dev_loss_tmo timer could be restarted such that it fires after the rport had been deleted. Validate rport state before starting the timer. Signed-off-by: James Smart <James.Smart@emulex.com> Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>

[SCSI] fc_transport: make all rports wait dev_loss_tmo before removing them
Per the comment in the change - it's not always prudent to immediately remove the rport upon first notice of a disconnect. Make all rports wait dev_loss_tmo before being deleted (and each could have a separate dev_loss_tmo value). The original post was: http://marc.info/?l=linux-scsi&m=117392196006703&w=2 The repost contains the following changes: - Bug fix in fc_starget_delete(). Dev_loss_tmo_callbk() was called prior to tearing down the target. The callback is to be the last thing called, as it tells the LLDD that the rport is completely finished and can be torn down. Rework so that terminate_rport_io() is called to terminate the outstanding io. Isolated work so it's is simply "starget" work. - Fix holes in original patch. There were code paths that did not expect the dev_loss_tmo timer to be running for the non-fcp rports. - Bug Fix: the transport wasn't protecting against a LLDD calling fc_remote_port_delete() back-to-back. Thus, the dev_loss_tmo timer could be restarted such that it fires after the rport had been deleted. Validate rport state before starting the timer. Signed-off-by: James Smart <James.Smart@emulex.com> Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
92740b24 · James Smart · James Bottomley · ac09c349 · 92740b24
Commit 92740b24 authored Apr 27, 2007 by James Smart Committed by James Bottomley May 06, 2007
Hide whitespace changes
Inline Side-by-side

Showing with 97 additions and 61 deletions

drivers/scsi/scsi_transport_fc.c drivers/scsi/scsi_transport_fc.c +97 -61

No files found.
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -1718,31 +1718,12 @@ fc_starget_delete(struct work_struct *work)
 	struct fc_rport *rport =
 		container_of(work, struct fc_rport, stgt_delete_work);
 	struct Scsi_Host *shost = rport_to_shost(rport);
-	unsigned long flags;
 	struct fc_internal *i = to_fc_internal(shost->transportt);
-	/*
+	/* Involve the LLDD if possible to terminate all io on the rport. */
-	 * Involve the LLDD if possible. All io on the rport is to
+	if (i->f->terminate_rport_io)
-	 * be terminated, either as part of the dev_loss_tmo callback
-	 * processing, or via the terminate_rport_io function.
-	 */
-	if (i->f->dev_loss_tmo_callbk)
-		i->f->dev_loss_tmo_callbk(rport);
-	else if (i->f->terminate_rport_io)
 		i->f->terminate_rport_io(rport);
-	spin_lock_irqsave(shost->host_lock, flags);
-	if (rport->flags & FC_RPORT_DEVLOSS_PENDING) {
-		spin_unlock_irqrestore(shost->host_lock, flags);
-		if (!cancel_delayed_work(&rport->fail_io_work))
-			fc_flush_devloss(shost);
-		if (!cancel_delayed_work(&rport->dev_loss_work))
-			fc_flush_devloss(shost);
-		spin_lock_irqsave(shost->host_lock, flags);
-		rport->flags &= ~FC_RPORT_DEVLOSS_PENDING;
-	}
-	spin_unlock_irqrestore(shost->host_lock, flags);
 	scsi_remove_target(&rport->dev);
 }
@@ -1760,6 +1741,7 @@ fc_rport_final_delete(struct work_struct *work)
 	struct device *dev = &rport->dev;
 	struct Scsi_Host *shost = rport_to_shost(rport);
 	struct fc_internal *i = to_fc_internal(shost->transportt);
+	unsigned long flags;
 	/*
 	 * if a scan is pending, flush the SCSI Host work_q so that 
@@ -1768,13 +1750,37 @@ fc_rport_final_delete(struct work_struct *work)
 	if (rport->flags & FC_RPORT_SCAN_PENDING)
 		scsi_flush_work(shost);
+	/* involve the LLDD to terminate all pending i/o */
+	if (i->f->terminate_rport_io)
+		i->f->terminate_rport_io(rport);
+	/*
+	 * Cancel any outstanding timers. These should really exist
+	 * only when rmmod'ing the LLDD and we're asking for
+	 * immediate termination of the rports
+	 */
+	spin_lock_irqsave(shost->host_lock, flags);
+	if (rport->flags & FC_RPORT_DEVLOSS_PENDING) {
+		spin_unlock_irqrestore(shost->host_lock, flags);
+		if (!cancel_delayed_work(&rport->fail_io_work))
+			fc_flush_devloss(shost);
+		if (!cancel_delayed_work(&rport->dev_loss_work))
+			fc_flush_devloss(shost);
+		spin_lock_irqsave(shost->host_lock, flags);
+		rport->flags &= ~FC_RPORT_DEVLOSS_PENDING;
+	}
+	spin_unlock_irqrestore(shost->host_lock, flags);
 	/* Delete SCSI target and sdevs */
 	if (rport->scsi_target_id != -1)
 		fc_starget_delete(&rport->stgt_delete_work);
-	else if (i->f->dev_loss_tmo_callbk)
+	/*
+	 * Notify the driver that the rport is now dead. The LLDD will
+	 * also guarantee that any communication to the rport is terminated
+	 */
+	if (i->f->dev_loss_tmo_callbk)
 		i->f->dev_loss_tmo_callbk(rport);
-	else if (i->f->terminate_rport_io)
-		i->f->terminate_rport_io(rport);
 	transport_remove_device(dev);
 	device_del(dev);
@@ -1963,8 +1969,6 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel,
 			}
 			if (match) {
-				struct delayed_work *work =
-							&rport->dev_loss_work;
 				memcpy(&rport->node_name, &ids->node_name,
 					sizeof(rport->node_name));
@@ -1982,46 +1986,61 @@ fc_remote_port_add(struct Scsi_Host *shost, int channel,
 						fci->f->dd_fcrport_size);
 				/*
-				 * If we were blocked, we were a target.
+				 * If we were not a target, cancel the
-				 * If no longer a target, we leave the timer
+				 * io terminate and rport timers, and
-				 * running in case the port changes roles
+				 * we're done.
-				 * prior to the timer expiring. If the timer
+				 *
-				 * fires, the target will be torn down.
+				 * If we were a target, but our new role
+				 * doesn't indicate a target, leave the
+				 * timers running expecting the role to
+				 * change as the target fully logs in. If
+				 * it doesn't, the target will be torn down.
+				 *
+				 * If we were a target, and our role shows
+				 * we're still a target, cancel the timers
+				 * and kick off a scan.
 				 */
-				if (!(ids->roles & FC_RPORT_ROLE_FCP_TARGET))
-					return rport;
-				/* restart the target */
+				/* was a target, not in roles */
+				if ((rport->scsi_target_id != -1) &&
+				    (!(ids->roles & FC_RPORT_ROLE_FCP_TARGET)))
+					return rport;
 				/*
-				 * Stop the target timers first. Take no action
+				 * Stop the fail io and dev_loss timers.
-				 * on the del_timer failure as the state
+				 * If they flush, the port_state will
-				 * machine state change will validate the
+				 * be checked and will NOOP the function.
-				 * transaction.
 				 */
 				if (!cancel_delayed_work(&rport->fail_io_work))
 					fc_flush_devloss(shost);
-				if (!cancel_delayed_work(work))
+				if (!cancel_delayed_work(&rport->dev_loss_work))
 					fc_flush_devloss(shost);
 				spin_lock_irqsave(shost->host_lock, flags);
 				rport->flags &= ~FC_RPORT_DEVLOSS_PENDING;
-				/* initiate a scan of the target */
+				/* if target, initiate a scan */
-				rport->flags |= FC_RPORT_SCAN_PENDING;
+				if (rport->scsi_target_id != -1) {
-				scsi_queue_work(shost, &rport->scan_work);
+					rport->flags |= FC_RPORT_SCAN_PENDING;
+					scsi_queue_work(shost,
-				spin_unlock_irqrestore(shost->host_lock, flags);
+							&rport->scan_work);
+					spin_unlock_irqrestore(shost->host_lock,
-				scsi_target_unblock(&rport->dev);
+							flags);
+					scsi_target_unblock(&rport->dev);
+				} else
+					spin_unlock_irqrestore(shost->host_lock,
+							flags);
 				return rport;
 			}
 		}
 	}
-	/* Search the bindings array */
+	/*
+	 * Search the bindings array
+	 * Note: if never a FCP target, you won't be on this list
+	 */
 	if (fc_host->tgtid_bind_type != FC_TGTID_BIND_NONE) {
 		/* search for a matching consistent binding */
@@ -2158,15 +2177,24 @@ fc_remote_port_delete(struct fc_rport  *rport)
 	spin_lock_irqsave(shost->host_lock, flags);
-	/* If no scsi target id mapping, delete it */
+	if (rport->port_state != FC_PORTSTATE_ONLINE) {
-	if (rport->scsi_target_id == -1) {
-		list_del(&rport->peers);
-		rport->port_state = FC_PORTSTATE_DELETED;
-		fc_queue_work(shost, &rport->rport_delete_work);
 		spin_unlock_irqrestore(shost->host_lock, flags);
 		return;
 	}
+	/*
+	 * In the past, we if this was not an FCP-Target, we would
+	 * unconditionally just jump to deleting the rport.
+	 * However, rports can be used as node containers by the LLDD,
+	 * and its not appropriate to just terminate the rport at the
+	 * first sign of a loss in connectivity. The LLDD may want to
+	 * send ELS traffic to re-validate the login. If the rport is
+	 * immediately deleted, it makes it inappropriate for a node
+	 * container.
+	 * So... we now unconditionally wait dev_loss_tmo before
+	 * destroying an rport.
+	 */
 	rport->port_state = FC_PORTSTATE_BLOCKED;
 	rport->flags |= FC_RPORT_DEVLOSS_PENDING;
@@ -2263,11 +2291,11 @@ fc_remote_port_rolechg(struct fc_rport  *rport, u32 roles)
 EXPORT_SYMBOL(fc_remote_port_rolechg);
 /**
- * fc_timeout_deleted_rport - Timeout handler for a deleted remote port that
+ * fc_timeout_deleted_rport - Timeout handler for a deleted remote port,
- *                       was a SCSI target (thus was blocked), and failed
+ * 			which we blocked, and has now failed to return
- *                       to return in the alloted time.
+ * 			in the allotted time.
 * 
- * @work:	rport target that failed to reappear in the alloted time.
+ * @work:	rport target that failed to reappear in the allotted time.
 **/
 static void
 fc_timeout_deleted_rport(struct work_struct *work)
@@ -2283,10 +2311,12 @@ fc_timeout_deleted_rport(struct work_struct *work)
 	rport->flags &= ~FC_RPORT_DEVLOSS_PENDING;
 	/*
-	 * If the port is ONLINE, then it came back. Validate it's still an
+	 * If the port is ONLINE, then it came back. If it was a SCSI
-	 * FCP target. If not, tear down the scsi_target on it.
+	 * target, validate it still is. If not, tear down the
+	 * scsi_target on it.
 	 */
 	if ((rport->port_state == FC_PORTSTATE_ONLINE) &&
+	    (rport->scsi_target_id != -1) &&
 	    !(rport->roles & FC_RPORT_ROLE_FCP_TARGET)) {
 		dev_printk(KERN_ERR, &rport->dev,
 			"blocked FC remote port time out: no longer"
@@ -2297,18 +2327,24 @@ fc_timeout_deleted_rport(struct work_struct *work)
 		return;
 	}
+	/* NOOP state - we're flushing workq's */
 	if (rport->port_state != FC_PORTSTATE_BLOCKED) {
 		spin_unlock_irqrestore(shost->host_lock, flags);
 		dev_printk(KERN_ERR, &rport->dev,
-			"blocked FC remote port time out: leaving target alone\n");
+			"blocked FC remote port time out: leaving"
+			" rport%s alone\n",
+			(rport->scsi_target_id != -1) ?  " and starget" : "");
 		return;
 	}
-	if (fc_host->tgtid_bind_type == FC_TGTID_BIND_NONE) {
+	if ((fc_host->tgtid_bind_type == FC_TGTID_BIND_NONE) ||
+	    (rport->scsi_target_id == -1)) {
 		list_del(&rport->peers);
 		rport->port_state = FC_PORTSTATE_DELETED;
 		dev_printk(KERN_ERR, &rport->dev,
-			"blocked FC remote port time out: removing target\n");
+			"blocked FC remote port time out: removing"
+			" rport%s\n",
+			(rport->scsi_target_id != -1) ?  " and starget" : "");
 		fc_queue_work(shost, &rport->rport_delete_work);
 		spin_unlock_irqrestore(shost->host_lock, flags);
 		return;