Change signal used to exit scsi error handlers

I reported earlier that the error handler for ide-scsi exits prematurely if ide-scsi is modprobed from rc.sysinit. I put in some debug prints to apprehend the culprit responsible for sending the SIGHUP signal that causes the exit. This is what my log captured:

Jan 1 12:20:13 fallguy kernel: Process 223 [modprobe] starting scsi error handler
Jan 1 12:20:13 fallguy kernel: Wake up parent of scsi_eh_2, pid 224
Jan 1 12:20:13 fallguy kernel: Signals pending for scsi_eh_2: 00000000 00000000
Jan 1 12:20:13 fallguy kernel: Error handler scsi_eh_2 sleeping
Jan 1 12:20:13 fallguy kernel: scsi2 : SCSI host adapter emulation for IDE ATAPI devices
[detected devices skipped]
Jan 1 12:20:14 fallguy kernel: Signal 15 sent from 181 [rc.sysinit] to 182 [getkey]
Jan 1 12:20:14 fallguy kernel: Signal 1 sent from 22 [init] to 22 [init]
Jan 1 12:20:14 fallguy kernel: Signal 18 sent from 22 [init] to 22 [init]
Jan 1 12:20:14 fallguy kernel: Signal 1 sent from 22 [init] to 22 [init]
Jan 1 12:20:14 fallguy kernel: Signal 1 sent from 22 [init] to 24 [initlog]
Jan 1 12:20:14 fallguy kernel: Signal 1 sent from 22 [init] to 78 [khubd]
Jan 1 12:20:14 fallguy kernel: Signal 1 sent from 22 [init] to 224 [scsi_eh_2]
Jan 1 12:20:14 fallguy kernel: Signals pending for scsi_eh_2: 00000001 00000000
Jan 1 12:20:14 fallguy kernel: Error handler scsi_eh_2 exiting

Here is a snapshot of some processes made during rc.sysinit:

F UID PID PPID PRI NI VSZ RSS WCHAN STAT TTY TIME COMMAND
100 0 1 0 15 0 1332 420 schedu S ? 0:05 init
...
040 0 22 1 16 0 1332 388 wait4 S tty1 0:00 init
000 0 23 22 15 0 4116 1316 wait4 S tty1 0:00 /bin/bash /
040 0 24 23 16 0 2160 1364 schedu S tty1 0:00 /sbin/initl
...

Init must have forked and exec'ed bash to run rc.sysinit, which then gets re-executed through initlog. When rc.sysinit ends, the last thing it does is send that TERM signal from sub-process 181 to getkey (process 182) -- the 'Signal 15 ...' line above.
As the forked init (process 22) exits, it sends a flurry of signals -- including SIGHUP (signal 1 in the log above) -- to all surviving processes created from it. That looks like standard "if I am to die I need to take all my offspring down with me" behavior -- do you agree? Since we want error handlers to survive, IMHO that means that SIGHUP is an unfortunate choice of signal for error handler exit. The comment in scsi_error.c suggests SIGPWR might be a worthy alternative, and I think that is true. Having inspected the init source, I found that init is not capable of sending SIGPWR. SIGPWR should never be sent by dying processes (its sole use should be from a power daemon _to_ init to shut the system down when the juice is running out). So I suggest the following changes to hosts.c and scsi_error.c:

Change signal used to exit scsi error handlers
I reported earlier that the error handler for ide-scsi exits prematurely if ide-scsi is modprobed from rc.sysinit. I put in some debug prints to apprehend the culprit responsible for sending the SIGHUP signal that causes the exit. This is what my log captured:

Jan 1 12:20:13 fallguy kernel: Process 223 [modprobe] starting scsi error handler
Jan 1 12:20:13 fallguy kernel: Wake up parent of scsi_eh_2, pid 224
Jan 1 12:20:13 fallguy kernel: Signals pending for scsi_eh_2: 00000000 00000000
Jan 1 12:20:13 fallguy kernel: Error handler scsi_eh_2 sleeping
Jan 1 12:20:13 fallguy kernel: scsi2 : SCSI host adapter emulation for IDE ATAPI devices
[detected devices skipped]
Jan 1 12:20:14 fallguy kernel: Signal 15 sent from 181 [rc.sysinit] to 182 [getkey]
Jan 1 12:20:14 fallguy kernel: Signal 1 sent from 22 [init] to 22 [init]
Jan 1 12:20:14 fallguy kernel: Signal 18 sent from 22 [init] to 22 [init]
Jan 1 12:20:14 fallguy kernel: Signal 1 sent from 22 [init] to 22 [init]
Jan 1 12:20:14 fallguy kernel: Signal 1 sent from 22 [init] to 24 [initlog]
Jan 1 12:20:14 fallguy kernel: Signal 1 sent from 22 [init] to 78 [khubd]
Jan 1 12:20:14 fallguy kernel: Signal 1 sent from 22 [init] to 224 [scsi_eh_2]
Jan 1 12:20:14 fallguy kernel: Signals pending for scsi_eh_2: 00000001 00000000
Jan 1 12:20:14 fallguy kernel: Error handler scsi_eh_2 exiting

Here is a snapshot of some processes made during rc.sysinit:

F UID PID PPID PRI NI VSZ RSS WCHAN STAT TTY TIME COMMAND
100 0 1 0 15 0 1332 420 schedu S ? 0:05 init
...
040 0 22 1 16 0 1332 388 wait4 S tty1 0:00 init
000 0 23 22 15 0 4116 1316 wait4 S tty1 0:00 /bin/bash /
040 0 24 23 16 0 2160 1364 schedu S tty1 0:00 /sbin/initl
...

Init must have forked and exec'ed bash to run rc.sysinit, which then gets re-executed through initlog. When rc.sysinit ends, the last thing it does is send that TERM signal from sub-process 181 to getkey (process 182) -- the 'Signal 15 ...' line above.
As the forked init (process 22) exits, it sends a flurry of signals -- including SIGHUP (signal 1 in the log above) -- to all surviving processes created from it. That looks like standard "if I am to die I need to take all my offspring down with me" behavior -- do you agree? Since we want error handlers to survive, IMHO that means that SIGHUP is an unfortunate choice of signal for error handler exit. The comment in scsi_error.c suggests SIGPWR might be a worthy alternative, and I think that is true. Having inspected the init source, I found that init is not capable of sending SIGPWR. SIGPWR should never be sent by dying processes (its sole use should be from a power daemon _to_ init to shut the system down when the juice is running out). So I suggest the following changes to hosts.c and scsi_error.c:
e18106d2 · Willem Riede · James Bottomley · f01f16c6 · e18106d2 · e18106d2
Commit e18106d2 authored Jan 10, 2003 by Willem Riede Committed by James Bottomley Jan 10, 2003
Show whitespace changes
Inline Side-by-side

Showing with 10 additions and 6 deletions

drivers/scsi/hosts.c drivers/scsi/hosts.c +1 -1

drivers/scsi/scsi_error.c drivers/scsi/scsi_error.c +9 -5

No files found.
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -337,7 +337,7 @@ void scsi_unregister(struct Scsi_Host *shost)
 	if (shost->ehandler) {
 		DECLARE_MUTEX_LOCKED(sem);
 		shost->eh_notify = &sem;
-		send_sig(SIGHUP, shost->ehandler, 1);
+		send_sig(SIGPWR, shost->ehandler, 1);
 		down(&sem);
 		shost->eh_notify = NULL;
 	}

--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -52,8 +52,12 @@
 * go to single-user mode.  For that matter, init also sends SIGKILL,
 * so we mustn't enable that one either.  We use SIGHUP instead.  Other
 * options would be SIGPWR, I suppose.
+ *
+ * Changed behavior 1/1/2003 - it turns out, that SIGHUP can get sent
+ * to error handlers from a process responsible for their creation.
+ * To sidestep that issue, we now use SIGPWR as suggested above.
 */
-#define SHUTDOWN_SIGS	(sigmask(SIGHUP))
+#define SHUTDOWN_SIGS	(sigmask(SIGPWR))

 #ifdef DEBUG
 #define SENSE_TIMEOUT SCSI_TIMEOUT
@@ -1618,7 +1622,7 @@ void scsi_error_handler(void *data)
 	/*
 	 * Wake up the thread that created us.
 	 */
-	SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent \n"));
+	SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent of scsi_eh_%d\n",shost->host_no));

 	up(shost->eh_notify);

@@ -1628,7 +1632,7 @@ void scsi_error_handler(void *data)
 		 * away and die.  This typically happens if the user is
 		 * trying to unload a module.
 		 */
-		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler sleeping\n"));
+		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d sleeping\n",shost->host_no));

 		/*
 		 * Note - we always use down_interruptible with the semaphore
@@ -1643,7 +1647,7 @@ void scsi_error_handler(void *data)
 		if (signal_pending(current))
 			break;

-		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler waking up\n"));
+		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d waking up\n",shost->host_no));

 		shost->eh_active = 1;

@@ -1671,7 +1675,7 @@ void scsi_error_handler(void *data)

 	}

-	SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler exiting\n"));
+	SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d exiting\n",shost->host_no));

 	/*
 	 * Make sure that nobody tries to wake us up again.