[PATCH] ppc64: Fix SLB reload bug

From: Paul Mackerras <paulus@samba.org> Recently we found a particularly nasty bug in the segment handling in the ppc64 kernel. It would only happen rarely under heavy load, but when it did the machine would lock up with the whole of memory filled with exception stack frames. The primary cause was that we were losing the translation for the kernel stack from the SLB, but we still had it in the ERAT for a while longer. Now, there is a critical region in various exception exit paths where we have loaded the SRR0 and SRR1 registers from GPRs and we are loading those GPRs and the stack pointer from the exception frame on the kernel stack. If we lose the ERAT entry for the kernel stack in that region, we take an SLB miss on the next access to the kernel stack. Taking the exception overwrites the values we have put into SRR0 and SRR1, which means we lose state. In fact we ended up repeating that last section of the exception exit path, but using the user stack pointer this time. That caused another exception (or if it didn't, we loaded a new value from the user stack and then went around and tried to use that). And it spiralled downwards from there. The patch below fixes the primary problem by making sure that we really never cast out the SLB entry for the kernel stack. It also improves debuggability in case anything like this happens again by: - In our exception exit paths, we now check whether the RI bit in the SRR1 value is 0. We already set the RI bit to 0 before starting the critical region, but we never checked it. Now, if we do ever get an exception in one of the critical regions, we will detect it before returning to the critical region, and instead we will print a nasty message and oops. - In the exception entry code, we now check that the kernel stack pointer value we're about to use isn't a userspace address. If it is, we print a nasty message and oops. This has been tested on G5 and pSeries (both with and without hypervisor) and compile-tested on iSeries.

[PATCH] ppc64: Fix SLB reload bug
From: Paul Mackerras <paulus@samba.org> Recently we found a particularly nasty bug in the segment handling in the ppc64 kernel. It would only happen rarely under heavy load, but when it did the machine would lock up with the whole of memory filled with exception stack frames. The primary cause was that we were losing the translation for the kernel stack from the SLB, but we still had it in the ERAT for a while longer. Now, there is a critical region in various exception exit paths where we have loaded the SRR0 and SRR1 registers from GPRs and we are loading those GPRs and the stack pointer from the exception frame on the kernel stack. If we lose the ERAT entry for the kernel stack in that region, we take an SLB miss on the next access to the kernel stack. Taking the exception overwrites the values we have put into SRR0 and SRR1, which means we lose state. In fact we ended up repeating that last section of the exception exit path, but using the user stack pointer this time. That caused another exception (or if it didn't, we loaded a new value from the user stack and then went around and tried to use that). And it spiralled downwards from there. The patch below fixes the primary problem by making sure that we really never cast out the SLB entry for the kernel stack. It also improves debuggability in case anything like this happens again by: - In our exception exit paths, we now check whether the RI bit in the SRR1 value is 0. We already set the RI bit to 0 before starting the critical region, but we never checked it. Now, if we do ever get an exception in one of the critical regions, we will detect it before returning to the critical region, and instead we will print a nasty message and oops. - In the exception entry code, we now check that the kernel stack pointer value we're about to use isn't a userspace address. If it is, we print a nasty message and oops. This has been tested on G5 and pSeries (both with and without hypervisor) and compile-tested on iSeries.
f75bd853 · Andrew Morton · Linus Torvalds · 973a4e70 · f75bd853 · f75bd853
Commit f75bd853 authored Mar 18, 2004 by Andrew Morton Committed by Linus Torvalds Mar 18, 2004
4 changed files
--- a/arch/ppc64/kernel/entry.S
+++ b/arch/ppc64/kernel/entry.S
@@ -292,7 +292,24 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	addi	r6,r4,-THREAD	/* Convert THREAD to 'current' */
 	std	r6,PACACURRENT(r13)	/* Set new 'current' */

-	ld	r1,KSP(r4)	/* Load new stack pointer */
+	ld	r8,KSP(r4)	/* new stack pointer */
+BEGIN_FTR_SECTION
+	clrrdi	r6,r8,28	/* get its ESID */
+	clrrdi	r9,r1,28	/* get current sp ESID */
+	clrldi.	r0,r6,2		/* is new ESID c00000000? */
+	cmpd	cr1,r6,r9	/* or is new ESID the same as current ESID? */
+	cror	eq,4*cr1+eq,eq
+	beq	2f		/* if yes, don't slbie it */
+	oris	r6,r6,0x0800	/* set C (class) bit */
+	slbie	r6
+2:
+END_FTR_SECTION_IFSET(CPU_FTR_SLB)
+	clrrdi	r7,r8,THREAD_SHIFT	/* base of new stack */
+	addi	r7,r7,THREAD_SIZE-INT_FRAME_SIZE
+
+	mr	r1,r8		/* start using new stack pointer */
+	std	r7,PACAKSAVE(r13)
+
 	ld	r6,_CCR(r1)
 	mtcrf	0xFF,r6

@@ -357,7 +374,6 @@ _GLOBAL(ret_from_except)
 	addi	r0,r1,INT_FRAME_SIZE	/* size of frame */
 	ld	r4,PACACURRENT(r13)
 	std	r0,THREAD+KSP(r4)	/* save kernel stack pointer */
-	std	r1,PACAKSAVE(r13)	/* save exception stack pointer */

 	/*
 	 * r13 is our per cpu area, only restore it if we are returning to
@@ -388,6 +404,10 @@ restore:
 4:	stb	r5,PACAPROCENABLED(r4)
 #endif

+	ld	r3,_MSR(r1)
+	andi.	r3,r3,MSR_RI
+	beq-	unrecov_restore
+
 	ld	r3,_CTR(r1)
 	ld	r0,_LINK(r1)
 	mtctr	r3
@@ -439,6 +459,11 @@ do_work:
 	bl	.do_signal
 	b	.ret_from_except

+unrecov_restore:
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.unrecoverable_exception
+	b	unrecov_restore
+
 #ifdef CONFIG_PPC_PSERIES
 /*
 * On CHRP, the Run-Time Abstraction Services (RTAS) have to be

--- a/arch/ppc64/kernel/head.S
+++ b/arch/ppc64/kernel/head.S
@@ -275,14 +275,15 @@ _GLOBAL(__secondary_hold)
 	mfspr	r22,SPRG1;		/* Save r21 in exc. frame      */ \
 	std	r22,EX_R21(r21);	                                  \
 	std     r21,PACAEXCSP(r20);     /* update exception stack ptr  */ \
-		                        /*   iff no protection flt     */ \
 	ld	r22,EX_SRR1(r21);	/* Get SRR1 from exc. frame    */ \
 	andi.   r22,r22,MSR_PR;         /* Set CR for later branch     */ \
 	mr      r22,r1;                 /* Save r1                     */ \
 	subi    r1,r1,INT_FRAME_SIZE;   /* alloc frame on kernel stack */ \
 	beq-    1f;                                                       \
 	ld      r1,PACAKSAVE(r20);      /* kernel stack to use         */ \
-1:      std     r22,GPR1(r1);           /* save r1 in stackframe       */ \
+1:      cmpdi	cr1,r1,0;		/* check if r1 is in userspace */ \
+	bge	cr1,bad_stack;		/* abort if it is	       */ \
+	std     r22,GPR1(r1);           /* save r1 in stackframe       */ \
 	std     r22,0(r1);              /* make stack chain pointer    */ \
 	std     r23,_CCR(r1);           /* save CR in stackframe       */ \
 	ld	r22,EX_R20(r21);	/* move r20 to stackframe      */ \
@@ -610,12 +611,68 @@ __end_stab:
 #else
 	STD_EXCEPTION_COMMON(0x1700, AltivecAssist, .UnknownException )
 #endif
+
+/*
+ * Here the exception frame is filled out and we have detected that
+ * the kernel stack pointer is bad.  R23 contains the saved CR, r20
+ * points to the paca, r21 points to the exception frame, and r22
+ * contains the (bad) kernel stack pointer.
+ * We switch to using the paca guard page as an emergency stack,
+ * save the registers on there, and call kernel_bad_stack(),
+ * which panics.
+ */
+bad_stack:
+	addi	r1,r20,8192-64-INT_FRAME_SIZE
+	std	r22,GPR1(r1)
+	std	r23,_CCR(r1)
+	ld	r22,EX_R20(r21)
+	std	r22,GPR20(r1)
+	ld	r23,EX_R21(r21)
+	std	r23,GPR21(r1)
+	ld	r22,EX_R22(r21)
+	std	r22,GPR22(r1)
+	ld	r23,EX_R23(r21)
+	std	r23,GPR23(r1)
+	ld	r23,EX_DAR(r21)
+	std	r23,_DAR(r1)
+	lwz     r22,EX_DSISR(r21)
+	std	r22,_DSISR(r1)
+	lwz	r23,EX_TRAP(r21)
+	std	r23,TRAP(r1)
+	ld	r22,EX_SRR0(r21)
+	ld	r23,EX_SRR1(r21)
+	std	r22,_NIP(r1)
+	std	r23,_MSR(r1)
+	addi	r21,r21,-EXC_FRAME_SIZE
+	std	r21,PACAEXCSP(r20)
+	mflr	r22
+	std	r22,_LINK(r1)
+	mfctr	r23
+	std	r23,_CTR(r1)
+	mfspr	r22,XER
+	std	r22,_XER(r1)
+	SAVE_GPR(0, r1)
+	SAVE_10GPRS(2, r1)
+	SAVE_8GPRS(12, r1)
+	SAVE_8GPRS(24, r1)
+	addi	r21,r1,INT_FRAME_SIZE
+	std	r21,0(r1)
+	li	r22,0
+	std	r22,0(r21)
+	ld	r2,PACATOC(r20)
+	mr	r13,r20
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.kernel_bad_stack
+	b	1b
+
 /*
 * Return from an exception which is handled without calling
 * save_remaining_regs.  The caller is assumed to have done
 * EXCEPTION_PROLOG_COMMON.
 */
 fast_exception_return:
+	andi.	r3,r23,MSR_RI		/* check if RI is set */
+	beq-	unrecov_fer
 	ld      r3,_CCR(r1)
 	ld      r4,_LINK(r1)
 	ld      r5,_CTR(r1)
@@ -639,6 +696,14 @@ fast_exception_return:
 	ld      r1,GPR1(r1)
 	rfid

+unrecov_fer:
+	li	r6,0x4000
+	li	r20,0
+	bl	.save_remaining_regs
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.unrecoverable_exception
+	b	1b
+
 /*
 * Here r20 points to the PACA, r21 to the exception frame,
 * r23 contains the saved CR.
@@ -996,6 +1061,11 @@ _GLOBAL(do_stab_bolted)
 	ld      r21,PACAEXCSP(r20)      /* Get the exception frame pointer */
 	addi    r21,r21,EXC_FRAME_SIZE
 	lwz	r23,EX_CCR(r21)		/* get saved CR */
+
+	ld	r22,EX_SRR1(r21)
+	andi.	r22,r22,MSR_RI
+	beq-	unrecov_stab
+
 	/* note that this is almost identical to maskable_exception_exit */
 	mtcr    r23                     /* restore CR */

@@ -1014,6 +1084,15 @@ _GLOBAL(do_stab_bolted)
 	mfspr	r21,SPRG1
 	rfid

+unrecov_stab:
+	EXCEPTION_PROLOG_COMMON
+	li	r6,0x4100
+	li	r20,0
+	bl	.save_remaining_regs
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.unrecoverable_exception
+	b	1b
+
 /*
 * r20 points to the PACA, r21 to the exception frame,
 * r23 contains the saved CR.
@@ -1052,15 +1131,23 @@ SLB_NUM_ENTRIES = 64
 	slbmfee	r21,r22
 	srdi	r21,r21,27
 	/*
-	 * This is incorrect (r1 is not the kernel stack) if we entered
-	 * from userspace but there is no critical window from userspace
-	 * so this should be OK. Also if we cast out the userspace stack
-	 * segment while in userspace we will fault it straight back in.
+	 * Use paca->ksave as the value of the kernel stack pointer,
+	 * because this is valid at all times.
+	 * The >> 27 (rather than >> 28) is so that the LSB is the
+	 * valid bit - this way we check valid and ESID in one compare.
+	 * In order to completely close the tiny race in the context
+	 * switch (between updating r1 and updating paca->ksave),
+	 * we check against both r1 and paca->ksave.
 	 */
 	srdi	r23,r1,27
 	ori	r23,r23,1
 	cmpd	r23,r21
 	beq-	1b
+	ld	r23,PACAKSAVE(r20)
+	srdi	r23,r23,27
+ 	ori	r23,r23,1
+ 	cmpd	r23,r21
+ 	beq-	1b

 	/* r20 = paca, r22 = entry */

@@ -1117,6 +1204,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_16M_PAGE)
 	lwz	r23,EX_CCR(r21)		/* get saved CR */
 	/* note that this is almost identical to maskable_exception_exit */

+	ld	r22,EX_SRR1(r21)
+	andi.	r22,r22,MSR_RI
+	beq-	unrecov_stab
+
 	/*
 	 * Until everyone updates binutils hardwire the POWER4 optimised
 	 * single field mtcrf
@@ -1190,9 +1281,8 @@ _GLOBAL(save_remaining_regs)

 	/*
 	 * Indicate that r1 contains the kernel stack and
-	 * get the Kernel TOC and CURRENT pointers from the paca
+	 * get the Kernel TOC pointer from the paca
 	 */
-	std	r22,PACAKSAVE(r13)	/* r1 is now kernel sp */
 	ld	r2,PACATOC(r13)		/* Get Kernel TOC pointer */

 	/*
@@ -1764,7 +1854,6 @@ _GLOBAL(__secondary_start)

 	std	r2,PACATOC(r13)
 	li	r6,0
-	std	r6,PACAKSAVE(r13)
 	stb	r6,PACAPROCENABLED(r13)

 #ifndef CONFIG_PPC_ISERIES
@@ -1783,6 +1872,7 @@ _GLOBAL(__secondary_start)
 	ldx	r1,r3,r28
 	addi	r1,r1,THREAD_SIZE
 	subi	r1,r1,STACK_FRAME_OVERHEAD
+	std	r1,PACAKSAVE(r13)

 	ld	r3,PACASTABREAL(r13)    /* get raddr of segment table       */
 	ori	r4,r3,1			/* turn on valid bit                */
@@ -2055,7 +2145,7 @@ _STATIC(start_here_common)

 	std	r2,PACATOC(r13)
 	li	r5,0
-	std	r0,PACAKSAVE(r13)
+	std	r1,PACAKSAVE(r13)

 	/* Restore the parms passed in from the bootloader. */
 	mr	r3,r31

--- a/arch/ppc64/kernel/process.c
+++ b/arch/ppc64/kernel/process.c
@@ -161,28 +161,6 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	local_irq_save(flags);
 	last = _switch(old_thread, new_thread);

-	/*
-	 * force our kernel stack out of the ERAT and SLB, this is to
-	 * avoid the race where we it hangs around in the ERAT but not the
-	 * SLB and the ERAT gets invalidated at just the wrong moment by
-	 * another CPU doing a tlbie.
-	 *
-	 * We definitely dont want to flush our bolted segment, so check
-	 * for that first.
-	 */
-	if ((cur_cpu_spec->cpu_features & CPU_FTR_SLB) &&
-	    GET_ESID(__get_SP()) != GET_ESID(PAGE_OFFSET)) {
-		union {
-			unsigned long word0;
-			slb_dword0 data;
-		} esid_data;
-
-		esid_data.word0 = 0;
-		/* class bit is in valid field for slbie instruction */
-		esid_data.data.v = 1;
-		esid_data.data.esid = GET_ESID(__get_SP());
-		asm volatile("isync; slbie %0; isync" : : "r" (esid_data));
-	}
 	local_irq_restore(flags);

 	return last;

--- a/arch/ppc64/kernel/traps.c
+++ b/arch/ppc64/kernel/traps.c
@@ -511,6 +511,32 @@ AltivecAssistException(struct pt_regs *regs)
 }
 #endif /* CONFIG_ALTIVEC */

+/*
+ * We enter here if we get an unrecoverable exception, that is, one
+ * that happened at a point where the RI (recoverable interrupt) bit
+ * in the MSR is 0.  This indicates that SRR0/1 are live, and that
+ * we therefore lost state by taking this exception.
+ */
+void unrecoverable_exception(struct pt_regs *regs)
+{
+	printk(KERN_EMERG "Unrecoverable exception %lx at %lx\n",
+	       regs->trap, regs->nip);
+	debugger(regs);
+	die("Unrecoverable exception", regs, SIGABRT);
+}
+
+/*
+ * We enter here if we discover during exception entry that we are
+ * running in supervisor mode with a userspace value in the stack pointer.
+ */
+void kernel_bad_stack(struct pt_regs *regs)
+{
+	printk(KERN_EMERG "Bad kernel stack pointer %lx at %lx\n",
+	       regs->gpr[1], regs->nip);
+	debugger(regs);
+	die("Bad kernel stack pointer", regs, SIGABRT);
+}
+
 void __init trap_init(void)
 {
 }