Commit 0dba89dc authored by James Bottomley

Merge raven.il.steeleye.com:/home/jejb/BK/scsi-error-2.5

into raven.il.steeleye.com:/home/jejb/BK/scsi-error-new-2.5
parents 3b6244e1 0166205d
...@@ -397,6 +397,7 @@ struct Scsi_Host * scsi_register(Scsi_Host_Template *shost_tp, int xtr_bytes) ...@@ -397,6 +397,7 @@ struct Scsi_Host * scsi_register(Scsi_Host_Template *shost_tp, int xtr_bytes)
spin_lock_init(&shost->default_lock); spin_lock_init(&shost->default_lock);
scsi_assign_lock(shost, &shost->default_lock); scsi_assign_lock(shost, &shost->default_lock);
INIT_LIST_HEAD(&shost->my_devices); INIT_LIST_HEAD(&shost->my_devices);
INIT_LIST_HEAD(&shost->eh_cmd_list);
init_waitqueue_head(&shost->host_wait); init_waitqueue_head(&shost->host_wait);
shost->dma_channel = 0xff; shost->dma_channel = 0xff;
...@@ -641,18 +642,3 @@ void scsi_host_busy_dec_and_test(struct Scsi_Host *shost, Scsi_Device *sdev) ...@@ -641,18 +642,3 @@ void scsi_host_busy_dec_and_test(struct Scsi_Host *shost, Scsi_Device *sdev)
} }
spin_unlock_irqrestore(shost->host_lock, flags); spin_unlock_irqrestore(shost->host_lock, flags);
} }
/**
 * scsi_host_failed_inc_and_test - account for one more failed command.
 * @shost:	host whose failure count is being bumped.
 *
 * With shost->host_lock taken (interrupts saved/restored), flags the host
 * as being in recovery and increments host_failed.  When the new
 * host_failed value equals host_busy, eh_wait is upped and a level-5
 * error-recovery log message is emitted.
 **/
void scsi_host_failed_inc_and_test(struct Scsi_Host *shost)
{
	unsigned long irq_flags;

	spin_lock_irqsave(shost->host_lock, irq_flags);

	shost->in_recovery = 1;
	if (++shost->host_failed == shost->host_busy) {
		up(shost->eh_wait);
		SCSI_LOG_ERROR_RECOVERY(5, printk("Waking error handler"
						  " thread\n"));
	}

	spin_unlock_irqrestore(shost->host_lock, irq_flags);
}
...@@ -384,6 +384,7 @@ struct Scsi_Host ...@@ -384,6 +384,7 @@ struct Scsi_Host
spinlock_t default_lock; spinlock_t default_lock;
spinlock_t *host_lock; spinlock_t *host_lock;
struct list_head eh_cmd_list;
struct task_struct * ehandler; /* Error recovery thread. */ struct task_struct * ehandler; /* Error recovery thread. */
struct semaphore * eh_wait; /* The error recovery thread waits on struct semaphore * eh_wait; /* The error recovery thread waits on
this. */ this. */
...@@ -587,7 +588,6 @@ extern void scsi_host_init(void); ...@@ -587,7 +588,6 @@ extern void scsi_host_init(void);
*/ */
extern void scsi_host_busy_inc(struct Scsi_Host *, Scsi_Device *); extern void scsi_host_busy_inc(struct Scsi_Host *, Scsi_Device *);
extern void scsi_host_busy_dec_and_test(struct Scsi_Host *, Scsi_Device *); extern void scsi_host_busy_dec_and_test(struct Scsi_Host *, Scsi_Device *);
extern void scsi_host_failed_inc_and_test(struct Scsi_Host *);
/** /**
* scsi_find_device - find a device given the host * scsi_find_device - find a device given the host
......
...@@ -790,13 +790,9 @@ static void scsi_softirq(struct softirq_action *h) ...@@ -790,13 +790,9 @@ static void scsi_softirq(struct softirq_action *h)
if ((status_byte(SCpnt->result) & CHECK_CONDITION) != 0) { if ((status_byte(SCpnt->result) & CHECK_CONDITION) != 0) {
SCSI_LOG_MLCOMPLETE(3, print_sense("bh", SCpnt)); SCSI_LOG_MLCOMPLETE(3, print_sense("bh", SCpnt));
} }
if (SCpnt->device->host->eh_wait != NULL) {
scsi_eh_eflags_set(SCpnt, SCSI_EH_CMD_FAILED | SCSI_EH_CMD_ERR);
SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
SCpnt->state = SCSI_STATE_FAILED;
scsi_host_failed_inc_and_test(SCpnt->device->host); if (!scsi_eh_scmd_add(SCpnt, 0))
} else { {
/* /*
* We only get here if the error * We only get here if the error
* recovery thread has died. * recovery thread has died.
...@@ -1298,6 +1294,44 @@ void scsi_device_put(struct scsi_device *sdev) ...@@ -1298,6 +1294,44 @@ void scsi_device_put(struct scsi_device *sdev)
module_put(sdev->host->hostt->module); module_put(sdev->host->hostt->module);
} }
/**
 * scsi_set_device_offline - set scsi_device offline
 * @sdev:	pointer to struct scsi_device to offline.
 *
 * Clears sdev->online, then walks sdev->cmd_list under sdev->list_lock.
 * Every command that has a request whose rq_status is not RQ_INACTIVE and
 * whose timer could still be deleted is passed to scsi_eh_scmd_add() with
 * SCSI_EH_CANCEL_CMD.
 *
 * Locks:	host_lock held on entry (as stated by the original header).
 *		NOTE(review): scsi_eh_scmd_add() acquires shost->host_lock
 *		itself, and is called here with sdev->list_lock held -
 *		confirm the intended locking with the callers.
 **/
void scsi_set_device_offline(struct scsi_device *sdev)
{
	struct scsi_cmnd *cmd;
	unsigned long irq_flags;
	int nr_active = 0;

	sdev->online = FALSE;

	spin_lock_irqsave(&sdev->list_lock, irq_flags);
	list_for_each_entry(cmd, &sdev->cmd_list, list) {
		/* Skip commands without a request or with an inactive one. */
		if (!cmd->request || cmd->request->rq_status == RQ_INACTIVE)
			continue;

		/*
		 * A timer that can no longer be removed means the command
		 * has already timed out or finished.
		 */
		if (!scsi_delete_timer(cmd))
			continue;

		nr_active++;
		/* The return value is not checked here. */
		scsi_eh_scmd_add(cmd, SCSI_EH_CANCEL_CMD);
	}
	spin_unlock_irqrestore(&sdev->list_lock, irq_flags);

	if (nr_active == 0) {
		/* FIXME: Send online state change hotplug event */
	}
}
/* /*
* Function: scsi_slave_attach() * Function: scsi_slave_attach()
* *
......
...@@ -455,6 +455,7 @@ extern int scsi_slave_attach(struct scsi_device *); ...@@ -455,6 +455,7 @@ extern int scsi_slave_attach(struct scsi_device *);
extern void scsi_slave_detach(struct scsi_device *); extern void scsi_slave_detach(struct scsi_device *);
extern int scsi_device_get(struct scsi_device *); extern int scsi_device_get(struct scsi_device *);
extern void scsi_device_put(struct scsi_device *); extern void scsi_device_put(struct scsi_device *);
extern void scsi_set_device_offline(struct scsi_device *);
extern void scsi_done(Scsi_Cmnd * SCpnt); extern void scsi_done(Scsi_Cmnd * SCpnt);
extern void scsi_finish_command(Scsi_Cmnd *); extern void scsi_finish_command(Scsi_Cmnd *);
extern int scsi_retry_command(Scsi_Cmnd *); extern int scsi_retry_command(Scsi_Cmnd *);
...@@ -726,6 +727,7 @@ struct scsi_cmnd { ...@@ -726,6 +727,7 @@ struct scsi_cmnd {
struct list_head list; /* scsi_cmnd participates in queue lists */ struct list_head list; /* scsi_cmnd participates in queue lists */
struct list_head eh_list; /* Used to place us on the host eh list */
int eh_state; /* Used for state tracking in error handlr */ int eh_state; /* Used for state tracking in error handlr */
int eh_eflags; /* Used by error handlr */ int eh_eflags; /* Used by error handlr */
void (*done) (struct scsi_cmnd *); /* Mid-level done function */ void (*done) (struct scsi_cmnd *); /* Mid-level done function */
...@@ -960,13 +962,13 @@ static inline Scsi_Cmnd *scsi_find_tag(Scsi_Device *SDpnt, int tag) { ...@@ -960,13 +962,13 @@ static inline Scsi_Cmnd *scsi_find_tag(Scsi_Device *SDpnt, int tag) {
/* /*
* Scsi Error Handler Flags * Scsi Error Handler Flags
*/ */
#define SCSI_EH_CMD_ERR 0x0001 /* Orig cmd error'd */ #define SCSI_EH_CANCEL_CMD 0x0001 /* Cancel this cmd */
#define SCSI_EH_CMD_FAILED 0x0002 /* Orig cmd error type failed */ #define SCSI_EH_REC_TIMEOUT 0x0002 /* EH retry timed out */
#define SCSI_EH_CMD_TIMEOUT 0x0004 /* Orig cmd error type timeout */
#define SCSI_EH_REC_TIMEOUT 0x0008 /* Recovery cmd timeout */
#define SCSI_SENSE_VALID(scmd) ((scmd->sense_buffer[0] & 0x70) == 0x70) #define SCSI_SENSE_VALID(scmd) ((scmd->sense_buffer[0] & 0x70) == 0x70)
extern int scsi_eh_scmd_add(struct scsi_cmnd *, int);
int scsi_set_medium_removal(Scsi_Device *dev, char state); int scsi_set_medium_removal(Scsi_Device *dev, char state);
extern int scsi_device_register(struct scsi_device *); extern int scsi_device_register(struct scsi_device *);
......
...@@ -55,6 +55,49 @@ ...@@ -55,6 +55,49 @@
#define BUS_RESET_SETTLE_TIME 10*HZ #define BUS_RESET_SETTLE_TIME 10*HZ
#define HOST_RESET_SETTLE_TIME 10*HZ #define HOST_RESET_SETTLE_TIME 10*HZ
/**
 * scsi_eh_scmd_add - add scsi cmd to error handling.
 * @scmd:	scmd to run eh on.
 * @eh_flag:	optional SCSI_EH flag (callers in this file also pass 0).
 *
 * Under shost->host_lock the command gets @eh_flag set, is marked as
 * owned by the error handler in the failed state, has its current
 * serial_number copied to serial_number_at_timeout and is appended to the
 * host's eh_cmd_list.  The host is flagged in_recovery and host_failed is
 * incremented; when it then equals host_busy, eh_wait is upped.
 *
 * Return value:
 *	0 on failure (shost->eh_wait is NULL; nothing is modified),
 *	1 once the command has been queued.
 **/
int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
{
	struct Scsi_Host *host = scmd->device->host;
	unsigned long irq_flags;

	/* eh_wait is tested before the host lock is taken. */
	if (!host->eh_wait)
		return 0;

	spin_lock_irqsave(host->host_lock, irq_flags);

	scsi_eh_eflags_set(scmd, eh_flag);

	/* FIXME: Can we stop setting owner and state. */
	scmd->owner = SCSI_OWNER_ERROR_HANDLER;
	scmd->state = SCSI_STATE_FAILED;

	/* Snapshot the serial number the command carries right now. */
	scmd->serial_number_at_timeout = scmd->serial_number;

	list_add_tail(&scmd->eh_list, &host->eh_cmd_list);

	host->in_recovery = 1;
	if (++host->host_failed == host->host_busy) {
		up(host->eh_wait);
		SCSI_LOG_ERROR_RECOVERY(5, printk("Waking error handler"
						  " thread\n"));
	}

	spin_unlock_irqrestore(host->host_lock, irq_flags);

	return 1;
}
/** /**
* scsi_add_timer - Start timeout timer for a single scsi command. * scsi_add_timer - Start timeout timer for a single scsi command.
* @scmd: scsi command that is about to start running. * @scmd: scsi command that is about to start running.
...@@ -131,22 +174,14 @@ int scsi_delete_timer(Scsi_Cmnd *scmd) ...@@ -131,22 +174,14 @@ int scsi_delete_timer(Scsi_Cmnd *scmd)
**/ **/
void scsi_times_out(Scsi_Cmnd *scmd) void scsi_times_out(Scsi_Cmnd *scmd)
{ {
struct Scsi_Host *shost = scmd->device->host; if (unlikely(!scsi_eh_scmd_add(scmd, SCSI_EH_CANCEL_CMD))) {
/* Set the serial_number_at_timeout to the current serial_number */
scmd->serial_number_at_timeout = scmd->serial_number;
scsi_eh_eflags_set(scmd, SCSI_EH_CMD_TIMEOUT | SCSI_EH_CMD_ERR);
if (unlikely(shost->eh_wait == NULL)) {
panic("Error handler thread not present at %p %p %s %d", panic("Error handler thread not present at %p %p %s %d",
scmd, shost, __FILE__, __LINE__); scmd, scmd->device->host, __FILE__, __LINE__);
} }
scsi_host_failed_inc_and_test(shost);
SCSI_LOG_TIMEOUT(3, printk("Command timed out busy=%d failed=%d\n", SCSI_LOG_TIMEOUT(3, printk("Command timed out busy=%d failed=%d\n",
shost->host_busy, shost->host_failed)); scmd->device->host->host_busy,
scmd->device->host->host_failed));
} }
/** /**
...@@ -176,36 +211,36 @@ int scsi_block_when_processing_errors(Scsi_Device *sdev) ...@@ -176,36 +211,36 @@ int scsi_block_when_processing_errors(Scsi_Device *sdev)
* @sc_list: List for failed cmds. * @sc_list: List for failed cmds.
* @shost: scsi host being recovered. * @shost: scsi host being recovered.
**/ **/
static void scsi_eh_prt_fail_stats(Scsi_Cmnd *sc_list, struct Scsi_Host *shost) static inline void scsi_eh_prt_fail_stats(struct Scsi_Host *shost)
{ {
Scsi_Cmnd *scmd; struct scsi_cmnd *scmd;
Scsi_Device *sdev; struct scsi_device *sdev;
int total_failures = 0; int total_failures = 0;
int cmd_failed = 0; int cmd_failed = 0;
int cmd_timed_out = 0; int cmd_cancel = 0;
int devices_failed = 0; int devices_failed = 0;
list_for_each_entry(sdev, &shost->my_devices, siblings) { list_for_each_entry(sdev, &shost->my_devices, siblings) {
for (scmd = sc_list; scmd; scmd = scmd->bh_next) { list_for_each_entry(scmd, &shost->eh_cmd_list, eh_list) {
if (scmd->device == sdev) { if (scmd->device == sdev) {
++total_failures; ++total_failures;
if (scsi_eh_eflags_chk(scmd, if (scsi_eh_eflags_chk(scmd,
SCSI_EH_CMD_TIMEOUT)) SCSI_EH_CANCEL_CMD))
++cmd_timed_out; ++cmd_cancel;
else else
++cmd_failed; ++cmd_failed;
} }
} }
if (cmd_timed_out || cmd_failed) { if (cmd_cancel || cmd_failed) {
SCSI_LOG_ERROR_RECOVERY(3, SCSI_LOG_ERROR_RECOVERY(3,
printk("%s: %d:%d:%d:%d cmds failed: %d," printk("%s: %d:%d:%d:%d cmds failed: %d,"
" timedout: %d\n", " cancel: %d\n",
__FUNCTION__, shost->host_no, __FUNCTION__, shost->host_no,
sdev->channel, sdev->id, sdev->lun, sdev->channel, sdev->id, sdev->lun,
cmd_failed, cmd_timed_out)); cmd_failed, cmd_cancel));
cmd_timed_out = 0; cmd_cancel = 0;
cmd_failed = 0; cmd_failed = 0;
++devices_failed; ++devices_failed;
} }
...@@ -217,68 +252,6 @@ static void scsi_eh_prt_fail_stats(Scsi_Cmnd *sc_list, struct Scsi_Host *shost) ...@@ -217,68 +252,6 @@ static void scsi_eh_prt_fail_stats(Scsi_Cmnd *sc_list, struct Scsi_Host *shost)
} }
#endif #endif
/**
 * scsi_eh_get_failed - Gather failed cmds.
 * @sc_list:	A pointer to a list for failed cmds.
 * @shost:	Scsi host being recovered.
 *
 * Walks every device on @shost and, holding that device's list_lock,
 * pushes each command flagged SCSI_EH_CMD_ERR onto the head of the
 * bh_next chain at *@sc_list.  Unflagged commands whose state is neither
 * SCSI_STATE_INITIALIZING nor SCSI_STATE_UNUSED are only logged.  A
 * mismatch between the number gathered and shost->host_failed is reported
 * with KERN_ERR.
 *
 * XXX Add opaque iterator for device / shost. Investigate direct
 * addition to per eh list on error allowing skipping of this step.
 **/
static void scsi_eh_get_failed(Scsi_Cmnd **sc_list, struct Scsi_Host *shost)
{
	Scsi_Device *dev;
	Scsi_Cmnd *cmd;
	int nr_found = 0;

	list_for_each_entry(dev, &shost->my_devices, siblings) {
		unsigned long irq_flags;

		spin_lock_irqsave(&dev->list_lock, irq_flags);
		list_for_each_entry(cmd, &dev->cmd_list, list) {
			if (scsi_eh_eflags_chk(cmd, SCSI_EH_CMD_ERR)) {
				/* Link in at the head of the caller's chain. */
				cmd->bh_next = *sc_list;
				*sc_list = cmd;
				nr_found++;
				continue;
			}

			/*
			 * FIXME Verify how this can happen and if
			 * this is still needed??
			 */
			if (cmd->state == SCSI_STATE_INITIALIZING ||
			    cmd->state == SCSI_STATE_UNUSED)
				continue;

			/*
			 * Something is still floating around out there.
			 * This could be the result of the upper level
			 * drivers still frobbing commands that might have
			 * succeeded.  Either the command block will
			 * eventually be freed, or the command will be
			 * queued and finished along the way.
			 */
			SCSI_LOG_ERROR_RECOVERY(1, printk("Error hdlr"
							  " prematurely woken"
							  " cmds still active"
							  " (%p %x %d)\n",
							  cmd, cmd->state,
							  cmd->device->id));
		}
		spin_unlock_irqrestore(&dev->list_lock, irq_flags);
	}

	SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(*sc_list, shost));

	if (shost->host_failed != nr_found)
		printk(KERN_ERR "%s: host_failed: %d != found: %d\n",
		       __FUNCTION__, shost->host_failed, nr_found);
}
/** /**
* scsi_check_sense - Examine scsi cmd sense * scsi_check_sense - Examine scsi cmd sense
* @scmd: Cmd to have sense checked. * @scmd: Cmd to have sense checked.
...@@ -535,7 +508,8 @@ static int scsi_send_eh_cmnd(Scsi_Cmnd *scmd, int timeout) ...@@ -535,7 +508,8 @@ static int scsi_send_eh_cmnd(Scsi_Cmnd *scmd, int timeout)
spin_lock_irqsave(scmd->device->host->host_lock, flags); spin_lock_irqsave(scmd->device->host->host_lock, flags);
if (scmd->device->host->hostt->eh_abort_handler) if (scmd->device->host->hostt->eh_abort_handler)
scmd->device->host->hostt->eh_abort_handler(scmd); scmd->device->host->hostt->eh_abort_handler(scmd);
spin_unlock_irqrestore(scmd->device->host->host_lock, flags); spin_unlock_irqrestore(scmd->device->host->host_lock,
flags);
scmd->request->rq_status = RQ_SCSI_DONE; scmd->request->rq_status = RQ_SCSI_DONE;
scmd->owner = SCSI_OWNER_ERROR_HANDLER; scmd->owner = SCSI_OWNER_ERROR_HANDLER;
...@@ -677,6 +651,7 @@ static int scsi_eh_retry_cmd(Scsi_Cmnd *scmd) ...@@ -677,6 +651,7 @@ static int scsi_eh_retry_cmd(Scsi_Cmnd *scmd)
* scsi_eh_finish_cmd - Handle a cmd that eh is finished with. * scsi_eh_finish_cmd - Handle a cmd that eh is finished with.
* @scmd: Original SCSI cmd that eh has finished. * @scmd: Original SCSI cmd that eh has finished.
* @shost: SCSI host that cmd originally failed on. * @shost: SCSI host that cmd originally failed on.
* @done_list: list_head for processed commands.
* *
* Notes: * Notes:
* We don't want to use the normal command completion while we are * We don't want to use the normal command completion while we are
...@@ -685,7 +660,8 @@ static int scsi_eh_retry_cmd(Scsi_Cmnd *scmd) ...@@ -685,7 +660,8 @@ static int scsi_eh_retry_cmd(Scsi_Cmnd *scmd)
* keep a list of pending commands for final completion, and once we * keep a list of pending commands for final completion, and once we
* are ready to leave error handling we handle completion for real. * are ready to leave error handling we handle completion for real.
**/ **/
static void scsi_eh_finish_cmd(Scsi_Cmnd *scmd, struct Scsi_Host *shost) static void scsi_eh_finish_cmd(Scsi_Cmnd *scmd, struct Scsi_Host *shost,
struct list_head *done_list )
{ {
shost->host_failed--; shost->host_failed--;
scmd->state = SCSI_STATE_BHQUEUE; scmd->state = SCSI_STATE_BHQUEUE;
...@@ -696,12 +672,14 @@ static void scsi_eh_finish_cmd(Scsi_Cmnd *scmd, struct Scsi_Host *shost) ...@@ -696,12 +672,14 @@ static void scsi_eh_finish_cmd(Scsi_Cmnd *scmd, struct Scsi_Host *shost)
* things. * things.
*/ */
scsi_setup_cmd_retry(scmd); scsi_setup_cmd_retry(scmd);
list_move_tail(&scmd->eh_list, done_list);
} }
/** /**
* scsi_eh_get_sense - Get device sense data. * scsi_eh_get_sense - Get device sense data.
* @sc_todo: list of cmds that have failed.
* @shost: scsi host being recovered. * @shost: scsi host being recovered.
* @done_list: list_head for processed commands.
* *
* Description: * Description:
* See if we need to request sense information. if so, then get it * See if we need to request sense information. if so, then get it
...@@ -719,23 +697,23 @@ static void scsi_eh_finish_cmd(Scsi_Cmnd *scmd, struct Scsi_Host *shost) ...@@ -719,23 +697,23 @@ static void scsi_eh_finish_cmd(Scsi_Cmnd *scmd, struct Scsi_Host *shost)
* *
* In 2.5 this capability will be going away. * In 2.5 this capability will be going away.
**/ **/
static int scsi_eh_get_sense(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost) static int scsi_eh_get_sense(struct Scsi_Host *shost,
struct list_head *done_list)
{ {
int rtn; int rtn;
struct list_head *lh, *lh_sf;
Scsi_Cmnd *scmd; Scsi_Cmnd *scmd;
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: checking to see if we need" list_for_each_safe(lh, lh_sf, &shost->eh_cmd_list) {
" to request sense\n", scmd = list_entry(lh, struct scsi_cmnd, eh_list);
__FUNCTION__)); if (scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD) ||
for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_FAILED) ||
SCSI_SENSE_VALID(scmd)) SCSI_SENSE_VALID(scmd))
continue; continue;
SCSI_LOG_ERROR_RECOVERY(2, printk("%s: requesting sense" SCSI_LOG_ERROR_RECOVERY(2, printk("%s: requesting sense"
" for tgt: %d\n", " for id: %d\n",
__FUNCTION__, scmd->device->id)); current->comm,
scmd->device->id));
rtn = scsi_request_sense(scmd); rtn = scsi_request_sense(scmd);
if (rtn != SUCCESS) if (rtn != SUCCESS)
continue; continue;
...@@ -752,7 +730,7 @@ static int scsi_eh_get_sense(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost) ...@@ -752,7 +730,7 @@ static int scsi_eh_get_sense(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
* upper level. * upper level.
*/ */
if (rtn == SUCCESS) if (rtn == SUCCESS)
scsi_eh_finish_cmd(scmd, shost); scsi_eh_finish_cmd(scmd, shost, done_list);
if (rtn != NEEDS_RETRY) if (rtn != NEEDS_RETRY)
continue; continue;
...@@ -771,10 +749,10 @@ static int scsi_eh_get_sense(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost) ...@@ -771,10 +749,10 @@ static int scsi_eh_get_sense(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
/* /*
* we eventually hand this one back to the top level. * we eventually hand this one back to the top level.
*/ */
scsi_eh_finish_cmd(scmd, shost); scsi_eh_finish_cmd(scmd, shost, done_list);
} }
return shost->host_failed; return list_empty(&shost->eh_cmd_list);
} }
/** /**
...@@ -864,9 +842,9 @@ static int scsi_eh_tur(Scsi_Cmnd *scmd) ...@@ -864,9 +842,9 @@ static int scsi_eh_tur(Scsi_Cmnd *scmd)
} }
/** /**
* scsi_eh_abort_cmd - abort a timed-out cmd. * scsi_eh_abort_cmds - abort canceled commands.
* @sc_todo: A list of cmds that have failed.
* @shost: scsi host being recovered. * @shost: scsi host being recovered.
* @done_list: list_head for processed commands.
* *
* Description: * Description:
* Try and see whether or not it makes sense to try and abort the * Try and see whether or not it makes sense to try and abort the
...@@ -875,29 +853,36 @@ static int scsi_eh_tur(Scsi_Cmnd *scmd) ...@@ -875,29 +853,36 @@ static int scsi_eh_tur(Scsi_Cmnd *scmd)
* no sense to try and abort the command, since as far as the shost * no sense to try and abort the command, since as far as the shost
* adapter is concerned, it isn't running. * adapter is concerned, it isn't running.
**/ **/
static int scsi_eh_abort_cmd(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost) static int scsi_eh_abort_cmds(struct Scsi_Host *shost,
struct list_head *done_list)
{ {
int rtn; int rtn;
Scsi_Cmnd *scmd; struct list_head *lh, *lh_sf;
struct scsi_cmnd *scmd;
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: checking to see if we need" list_for_each_safe(lh, lh_sf, &shost->eh_cmd_list) {
" to abort cmd\n", __FUNCTION__)); scmd = list_entry(lh, struct scsi_cmnd, eh_list);
if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD))
for (scmd = sc_todo; scmd; scmd = scmd->bh_next) {
if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_TIMEOUT))
continue; continue;
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting cmd:"
"0x%p\n", current->comm,
scmd));
rtn = scsi_try_to_abort_cmd(scmd); rtn = scsi_try_to_abort_cmd(scmd);
if (rtn == SUCCESS) { if (rtn == SUCCESS) {
if (!scsi_eh_tur(scmd)) { scsi_eh_eflags_clr(scmd, SCSI_EH_CANCEL_CMD);
rtn = scsi_eh_retry_cmd(scmd); if (!scmd->device->online || !scsi_eh_tur(scmd)) {
if (rtn == SUCCESS) scsi_eh_finish_cmd(scmd, shost, done_list);
scsi_eh_finish_cmd(scmd, shost);
}
} }
} else
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: aborting"
" cmd failed:"
"0x%p\n",
current->comm,
scmd));
} }
return shost->host_failed;
return list_empty(&shost->eh_cmd_list);
} }
/** /**
...@@ -933,9 +918,9 @@ static int scsi_try_bus_device_reset(Scsi_Cmnd *scmd) ...@@ -933,9 +918,9 @@ static int scsi_try_bus_device_reset(Scsi_Cmnd *scmd)
} }
/** /**
* scsi_eh_bus_device_reset - send bdr is needed * scsi_eh_bus_device_reset - send bdr if needed
* @sc_todo: a list of cmds that have failed.
* @shost: scsi host being recovered. * @shost: scsi host being recovered.
* @done_list: list_head for processed commands.
* *
* Notes: * Notes:
* Try a bus device reset. still, look to see whether we have multiple * Try a bus device reset. still, look to see whether we have multiple
...@@ -943,39 +928,52 @@ static int scsi_try_bus_device_reset(Scsi_Cmnd *scmd) ...@@ -943,39 +928,52 @@ static int scsi_try_bus_device_reset(Scsi_Cmnd *scmd)
* makes no sense to try bus_device_reset - we really would need to try * makes no sense to try bus_device_reset - we really would need to try
* a bus_reset instead. * a bus_reset instead.
**/ **/
static int scsi_eh_bus_device_reset(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost) static int scsi_eh_bus_device_reset(struct Scsi_Host *shost,
struct list_head *done_list)
{ {
int rtn; int rtn;
Scsi_Cmnd *scmd; struct list_head *lh, *lh_sf;
Scsi_Device *sdev; struct scsi_cmnd *scmd, *bdr_scmd;
struct scsi_device *sdev;
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Trying BDR\n", __FUNCTION__));
list_for_each_entry(sdev, &shost->my_devices, siblings) { list_for_each_entry(sdev, &shost->my_devices, siblings) {
for (scmd = sc_todo; scmd; scmd = scmd->bh_next) bdr_scmd = NULL;
if ((scmd->device == sdev) && list_for_each_entry(scmd, &shost->eh_cmd_list, eh_list)
scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR)) if (scmd->device == sdev) {
bdr_scmd = scmd;
break; break;
}
if (!scmd) if (!bdr_scmd)
continue; continue;
/* SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending BDR sdev:"
* ok, we have a device that is having problems. try and send " 0x%p\n", current->comm,
* a bus device reset to it. sdev));
*/ rtn = scsi_try_bus_device_reset(bdr_scmd);
rtn = scsi_try_bus_device_reset(scmd); if (rtn == SUCCESS) {
if ((rtn == SUCCESS) && (!scsi_eh_tur(scmd))) if (!sdev->online || !scsi_eh_tur(bdr_scmd)) {
for (scmd = sc_todo; scmd; scmd = scmd->bh_next) list_for_each_safe(lh, lh_sf,
if ((scmd->device == sdev) && &shost->eh_cmd_list) {
scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR)) { scmd = list_entry(lh, struct
rtn = scsi_eh_retry_cmd(scmd); scsi_cmnd,
if (rtn == SUCCESS) eh_list);
scsi_eh_finish_cmd(scmd, shost); if (scmd->device == sdev)
scsi_eh_finish_cmd(scmd,
shost,
done_list);
}
}
} else {
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BDR"
" failed sdev:"
"0x%p\n",
current->comm,
sdev));
} }
} }
return shost->host_failed; return list_empty(&shost->eh_cmd_list);
} }
/** /**
...@@ -1005,7 +1003,8 @@ static int scsi_try_bus_reset(Scsi_Cmnd *scmd) ...@@ -1005,7 +1003,8 @@ static int scsi_try_bus_reset(Scsi_Cmnd *scmd)
/* /*
* Mark all affected devices to expect a unit attention. * Mark all affected devices to expect a unit attention.
*/ */
list_for_each_entry(sdev, &scmd->device->host->my_devices, siblings) list_for_each_entry(sdev, &scmd->device->host->my_devices,
siblings)
if (scmd->device->channel == sdev->channel) { if (scmd->device->channel == sdev->channel) {
sdev->was_reset = 1; sdev->was_reset = 1;
sdev->expecting_cc_ua = 1; sdev->expecting_cc_ua = 1;
...@@ -1041,7 +1040,8 @@ static int scsi_try_host_reset(Scsi_Cmnd *scmd) ...@@ -1041,7 +1040,8 @@ static int scsi_try_host_reset(Scsi_Cmnd *scmd)
/* /*
* Mark all affected devices to expect a unit attention. * Mark all affected devices to expect a unit attention.
*/ */
list_for_each_entry(sdev, &scmd->device->host->my_devices, siblings) list_for_each_entry(sdev, &scmd->device->host->my_devices,
siblings)
if (scmd->device->channel == sdev->channel) { if (scmd->device->channel == sdev->channel) {
sdev->was_reset = 1; sdev->was_reset = 1;
sdev->expecting_cc_ua = 1; sdev->expecting_cc_ua = 1;
...@@ -1051,25 +1051,19 @@ static int scsi_try_host_reset(Scsi_Cmnd *scmd) ...@@ -1051,25 +1051,19 @@ static int scsi_try_host_reset(Scsi_Cmnd *scmd)
} }
/** /**
* scsi_eh_bus_host_reset - send a bus reset and on failure try host reset * scsi_eh_bus_reset - send a bus reset
* @sc_todo: a list of cmds that have failed.
* @shost: scsi host being recovered. * @shost: scsi host being recovered.
* @done_list: list_head for processed commands.
**/ **/
static int scsi_eh_bus_host_reset(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost) static int scsi_eh_bus_reset(struct Scsi_Host *shost,
struct list_head *done_list)
{ {
int rtn; int rtn;
struct list_head *lh, *lh_sf;
Scsi_Cmnd *scmd; Scsi_Cmnd *scmd;
Scsi_Cmnd *chan_scmd; Scsi_Cmnd *chan_scmd;
unsigned int channel; unsigned int channel;
/*
* if we ended up here, we have serious problems. the only thing left
* to try is a full bus reset. if someone has grabbed the bus and isn't
* letting go, then perhaps this will help.
*/
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Try Bus/Host RST\n",
__FUNCTION__));
/* /*
* we really want to loop over the various channels, and do this on * we really want to loop over the various channels, and do this on
* a channel by channel basis. we should also check to see if any * a channel by channel basis. we should also check to see if any
...@@ -1078,9 +1072,8 @@ static int scsi_eh_bus_host_reset(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost) ...@@ -1078,9 +1072,8 @@ static int scsi_eh_bus_host_reset(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
*/ */
for (channel = 0; channel <= shost->max_channel; channel++) { for (channel = 0; channel <= shost->max_channel; channel++) {
for (scmd = sc_todo; scmd; scmd = scmd->bh_next) { chan_scmd = NULL;
if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR)) list_for_each_entry(scmd, &shost->eh_cmd_list, eh_list) {
continue;
if (channel == scmd->device->channel) { if (channel == scmd->device->channel) {
chan_scmd = scmd; chan_scmd = scmd;
break; break;
...@@ -1091,63 +1084,97 @@ static int scsi_eh_bus_host_reset(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost) ...@@ -1091,63 +1084,97 @@ static int scsi_eh_bus_host_reset(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost)
} }
} }
if (!scmd) if (!chan_scmd)
continue; continue;
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending BRST chan:"
/* " %d\n", current->comm,
* we now know that we are able to perform a reset for the channel));
* channel that scmd points to. rtn = scsi_try_bus_reset(chan_scmd);
*/
rtn = scsi_try_bus_reset(scmd);
if (rtn != SUCCESS)
rtn = scsi_try_host_reset(scmd);
if (rtn == SUCCESS) { if (rtn == SUCCESS) {
for (scmd = sc_todo; scmd; scmd = scmd->bh_next) { list_for_each_safe(lh, lh_sf, &shost->eh_cmd_list) {
if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR) scmd = list_entry(lh, struct scsi_cmnd,
|| channel != scmd->device->channel) eh_list);
continue; if (channel == scmd->device->channel)
if (!scsi_eh_tur(scmd)) { if (!scmd->device->online ||
rtn = scsi_eh_retry_cmd(scmd); !scsi_eh_tur(scmd))
scsi_eh_finish_cmd(scmd,
if (rtn == SUCCESS) shost,
scsi_eh_finish_cmd(scmd, shost); done_list);
} }
} else {
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: BRST"
" failed chan: %d\n",
current->comm,
channel));
} }
} }
return list_empty(&shost->eh_cmd_list);
}
/**
* scsi_eh_host_reset - send a host reset
* @shost: scsi host being recovered.
* @done_list: list_head for processed commands.
**/
static int scsi_eh_host_reset(struct Scsi_Host *shost,
struct list_head *done_list)
{
int rtn;
struct list_head *lh, *lh_sf;
Scsi_Cmnd *scmd;
if (!list_empty(&shost->eh_cmd_list)) {
scmd = list_entry(shost->eh_cmd_list.next,
struct scsi_cmnd, eh_list);
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: Sending HRST\n"
, current->comm));
rtn = scsi_try_host_reset(scmd);
if (rtn == SUCCESS) {
list_for_each_safe(lh, lh_sf, &shost->eh_cmd_list) {
scmd = list_entry(lh, struct scsi_cmnd, eh_list);
if (!scmd->device->online || !scsi_eh_tur(scmd))
scsi_eh_finish_cmd(scmd, shost,
done_list);
} }
return shost->host_failed; } else {
SCSI_LOG_ERROR_RECOVERY(3, printk("%s: HRST"
" failed\n",
current->comm));
}
}
return list_empty(&shost->eh_cmd_list);
} }
/** /**
* scsi_eh_offline_sdevs - offline scsi devices that fail to recover * scsi_eh_offline_sdevs - offline scsi devices that fail to recover
* @sc_todo: a list of cmds that have failed.
* @shost: scsi host being recovered. * @shost: scsi host being recovered.
* @done_list: list_head for processed commands.
* *
**/ **/
static void scsi_eh_offline_sdevs(Scsi_Cmnd *sc_todo, struct Scsi_Host *shost) static void scsi_eh_offline_sdevs(struct Scsi_Host *shost,
struct list_head *done_list)
{ {
struct list_head *lh, *lh_sf;
Scsi_Cmnd *scmd; Scsi_Cmnd *scmd;
for (scmd = sc_todo; scmd; scmd = scmd->bh_next) { list_for_each_safe(lh, lh_sf, &shost->eh_cmd_list) {
if (!scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_ERR)) scmd = list_entry(lh, struct scsi_cmnd, eh_list);
continue;
printk(KERN_INFO "scsi: Device offlined - not" printk(KERN_INFO "scsi: Device offlined - not"
" ready or command retry failed" " ready after error recovery: host"
" after error recovery: host"
" %d channel %d id %d lun %d\n", " %d channel %d id %d lun %d\n",
shost->host_no, shost->host_no,
scmd->device->channel, scmd->device->channel,
scmd->device->id, scmd->device->id,
scmd->device->lun); scmd->device->lun);
scmd->device->online = FALSE;
if (scsi_eh_eflags_chk(scmd, SCSI_EH_CMD_TIMEOUT)) if (scsi_eh_eflags_chk(scmd, SCSI_EH_CANCEL_CMD)) {
scmd->result |= (DRIVER_TIMEOUT << 24); /*
* FIXME: Handle lost cmds.
scmd->device->online = 0; */
scsi_eh_finish_cmd(scmd, shost); }
scsi_eh_finish_cmd(scmd, shost, done_list);
} }
return; return;
} }
...@@ -1442,6 +1469,8 @@ static void scsi_restart_operations(struct Scsi_Host *shost) ...@@ -1442,6 +1469,8 @@ static void scsi_restart_operations(struct Scsi_Host *shost)
ASSERT_LOCK(shost->host_lock, 0); ASSERT_LOCK(shost->host_lock, 0);
shost->in_recovery = 0;
/* /*
* If the door was locked, we need to insert a door lock request * If the door was locked, we need to insert a door lock request
* onto the head of the SCSI request queue for the device. There * onto the head of the SCSI request queue for the device. There
...@@ -1481,6 +1510,56 @@ static void scsi_restart_operations(struct Scsi_Host *shost) ...@@ -1481,6 +1510,56 @@ static void scsi_restart_operations(struct Scsi_Host *shost)
spin_unlock_irqrestore(shost->host_lock, flags); spin_unlock_irqrestore(shost->host_lock, flags);
} }
/**
 * scsi_eh_ready_devs - check device ready state and recover if not.
 * @shost:	host to be recovered.
 * @done_list:	list_head for processed commands.
 *
 * Escalates through bus device reset, bus reset and host reset, and
 * finally offlines whatever devices still have commands outstanding.
 *
 * Each reset step returns list_empty(&shost->eh_cmd_list), i.e. non-zero
 * once no failed commands remain.  Escalation must therefore continue
 * only while a step returns zero; a non-zero return means recovery is
 * complete and the harsher steps are skipped.  (The previous nesting
 * tested the return values without negation, so the harsher steps ran
 * only when there was nothing left to recover and were skipped exactly
 * when they were needed.)
 **/
static void scsi_eh_ready_devs(struct Scsi_Host *shost,
			       struct list_head *done_list)
{
	if (scsi_eh_bus_device_reset(shost, done_list))
		return;		/* eh_cmd_list drained by BDR */

	if (scsi_eh_bus_reset(shost, done_list))
		return;		/* eh_cmd_list drained by bus reset */

	if (scsi_eh_host_reset(shost, done_list))
		return;		/* eh_cmd_list drained by host reset */

	/* Commands are still outstanding after every reset level. */
	scsi_eh_offline_sdevs(shost, done_list);
}
/**
 * scsi_eh_flush_done_list - finish processed commands or retry them.
 * @shost:	host to be recovered (not referenced by the body).
 * @done_list:	list_head of processed commands.
 *
 * Unlinks every command from @done_list.  A command whose device is
 * offline gets DRIVER_TIMEOUT << 24 ORed into its result and is finished.
 * A command on an online device has its retry count incremented and is
 * retried while that count stays below scmd->allowed; otherwise it is
 * finished as well.
 **/
static void scsi_eh_flush_done_list(struct Scsi_Host *shost,
				    struct list_head *done_list)
{
	struct list_head *pos, *next;
	Scsi_Cmnd *cmd;

	list_for_each_safe(pos, next, done_list) {
		cmd = list_entry(pos, struct scsi_cmnd, eh_list);
		list_del_init(pos);

		if (!cmd->device->online) {
			cmd->result |= (DRIVER_TIMEOUT << 24);
		} else if (++cmd->retries < cmd->allowed) {
			SCSI_LOG_ERROR_RECOVERY(3,
				printk("%s: flush retry"
				       " cmd: %p\n",
				       current->comm,
				       cmd));
			scsi_retry_command(cmd);
			continue;
		}

		/* Offline device or retries exhausted: complete it. */
		SCSI_LOG_ERROR_RECOVERY(3, printk("%s: flush finish"
						  " cmd: %p\n",
						  current->comm, cmd));
		scsi_finish_command(cmd);
	}
}
/** /**
* scsi_unjam_host - Attempt to fix a host which has a cmd that failed. * scsi_unjam_host - Attempt to fix a host which has a cmd that failed.
* @shost: Host to unjam. * @shost: Host to unjam.
...@@ -1506,60 +1585,15 @@ static void scsi_restart_operations(struct Scsi_Host *shost) ...@@ -1506,60 +1585,15 @@ static void scsi_restart_operations(struct Scsi_Host *shost)
**/ **/
static void scsi_unjam_host(struct Scsi_Host *shost) static void scsi_unjam_host(struct Scsi_Host *shost)
{ {
Scsi_Cmnd *sc_todo = NULL; LIST_HEAD(done_list);
Scsi_Cmnd *scmd;
/*
* Is this assert really ok anymore (andmike). Should we at least
* be using spin_lock_unlocked.
*/
ASSERT_LOCK(shost->host_lock, 0);
scsi_eh_get_failed(&sc_todo, shost);
if (scsi_eh_get_sense(sc_todo, shost))
if (scsi_eh_abort_cmd(sc_todo, shost))
if (scsi_eh_bus_device_reset(sc_todo, shost))
if (scsi_eh_bus_host_reset(sc_todo, shost))
scsi_eh_offline_sdevs(sc_todo, shost);
BUG_ON(shost->host_failed); SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost));
if (!scsi_eh_get_sense(shost, &done_list))
if (!scsi_eh_abort_cmds(shost, &done_list))
scsi_eh_ready_devs(shost, &done_list);
/* scsi_eh_flush_done_list(shost, &done_list);
* We are currently holding these things in a linked list - we
* didn't put them in the bottom half queue because we wanted to
* keep things quiet while we were working on recovery, and
* passing them up to the top level could easily cause the top
* level to try and queue something else again.
*
* start by marking that the host is no longer in error recovery.
*/
shost->in_recovery = 0;
/*
* take the list of commands, and stick them in the bottom half queue.
* the current implementation of scsi_done will do this for us - if need
* be we can create a special version of this function to do the
* same job for us.
*/
for (scmd = sc_todo; scmd; scmd = sc_todo) {
sc_todo = scmd->bh_next;
scmd->bh_next = NULL;
/*
* Oh, this is a vile hack. scsi_done() expects a timer
* to be running on the command. If there isn't, it assumes
* that the command has actually timed out, and a timer
* handler is running. That may well be how we got into
* this fix, but right now things are stable. We add
* a timer back again so that we can report completion.
* scsi_done() will immediately remove said timer from
* the command, and then process it.
*/
scsi_add_timer(scmd, 100, scsi_eh_times_out);
scsi_done(scmd);
}
} }
/** /**
...@@ -1597,7 +1631,8 @@ void scsi_error_handler(void *data) ...@@ -1597,7 +1631,8 @@ void scsi_error_handler(void *data)
/* /*
* Wake up the thread that created us. * Wake up the thread that created us.
*/ */
SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent of scsi_eh_%d\n",shost->host_no)); SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent of"
" scsi_eh_%d\n",shost->host_no));
complete(shost->eh_notify); complete(shost->eh_notify);
...@@ -1607,7 +1642,9 @@ void scsi_error_handler(void *data) ...@@ -1607,7 +1642,9 @@ void scsi_error_handler(void *data)
* away and die. This typically happens if the user is * away and die. This typically happens if the user is
* trying to unload a module. * trying to unload a module.
*/ */
SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d sleeping\n",shost->host_no)); SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
" scsi_eh_%d"
" sleeping\n",shost->host_no));
/* /*
* Note - we always use down_interruptible with the semaphore * Note - we always use down_interruptible with the semaphore
...@@ -1622,7 +1659,9 @@ void scsi_error_handler(void *data) ...@@ -1622,7 +1659,9 @@ void scsi_error_handler(void *data)
if (shost->eh_kill) if (shost->eh_kill)
break; break;
SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d waking up\n",shost->host_no)); SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
" scsi_eh_%d waking"
" up\n",shost->host_no));
shost->eh_active = 1; shost->eh_active = 1;
...@@ -1650,7 +1689,8 @@ void scsi_error_handler(void *data) ...@@ -1650,7 +1689,8 @@ void scsi_error_handler(void *data)
} }
SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d exiting\n",shost->host_no)); SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d"
" exiting\n",shost->host_no));
/* /*
* Make sure that nobody tries to wake us up again. * Make sure that nobody tries to wake us up again.
......
...@@ -80,6 +80,7 @@ EXPORT_SYMBOL(scsi_slave_attach); ...@@ -80,6 +80,7 @@ EXPORT_SYMBOL(scsi_slave_attach);
EXPORT_SYMBOL(scsi_slave_detach); EXPORT_SYMBOL(scsi_slave_detach);
EXPORT_SYMBOL(scsi_device_get); EXPORT_SYMBOL(scsi_device_get);
EXPORT_SYMBOL(scsi_device_put); EXPORT_SYMBOL(scsi_device_put);
EXPORT_SYMBOL(scsi_set_device_offline);
/* /*
* This symbol is for the highlevel drivers (e.g. sg) only. * This symbol is for the highlevel drivers (e.g. sg) only.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment