runtime: call amd64 VDSO entry points on large stack

If the Linux kernel was built with CONFIG_OPTIMIZE_INLINING=n and was built with hardening options turned on, GCC will insert a stack probe in the VDSO function that requires a full page of stack space. The stack probe can corrupt memory if another thread is using it. Avoid sporadic crashes by calling the VDSO on the g0 or gsignal stack. While we're at it, align the stack as C code expects. We've been getting away with a misaligned stack, but it's possible that the VDSO code will change in the future to break that assumption. Benchmarks show a 11% hit on time.Now, but it's only 6ns. name old time/op new time/op delta AfterFunc-12 1.66ms ± 0% 1.66ms ± 1% ~ (p=0.905 n=9+10) After-12 1.90ms ± 6% 1.86ms ± 0% -2.05% (p=0.012 n=10+8) Stop-12 113µs ± 3% 115µs ± 2% +1.60% (p=0.017 n=9+10) SimultaneousAfterFunc-12 145µs ± 1% 144µs ± 0% -0.68% (p=0.002 n=10+8) StartStop-12 39.5µs ± 3% 40.4µs ± 5% +2.19% (p=0.023 n=10+10) Reset-12 10.2µs ± 0% 10.4µs ± 0% +2.45% (p=0.000 n=10+9) Sleep-12 190µs ± 1% 190µs ± 1% ~ (p=0.971 n=10+10) Ticker-12 4.68ms ± 2% 4.64ms ± 2% -0.83% (p=0.043 n=9+10) Now-12 48.4ns ±11% 54.0ns ±11% +11.42% (p=0.017 n=10+10) NowUnixNano-12 48.5ns ±13% 56.9ns ± 8% +17.30% (p=0.000 n=10+10) Format-12 489ns ±11% 504ns ± 6% ~ (p=0.289 n=10+10) FormatNow-12 436ns ±23% 480ns ±13% +10.25% (p=0.026 n=9+10) MarshalJSON-12 656ns ±14% 587ns ±24% ~ (p=0.063 n=10+10) MarshalText-12 647ns ± 7% 638ns ± 9% ~ (p=0.516 n=10+10) Parse-12 348ns ± 8% 328ns ± 9% -5.66% (p=0.030 n=10+10) ParseDuration-12 136ns ± 9% 140ns ±11% ~ (p=0.425 n=10+10) Hour-12 14.8ns ± 6% 15.6ns ±11% ~ (p=0.085 n=10+10) Second-12 14.0ns ± 6% 14.3ns ±12% ~ (p=0.443 n=10+10) Year-12 32.4ns ±11% 33.4ns ± 6% ~ (p=0.492 n=10+10) Day-12 41.5ns ± 9% 42.3ns ±12% ~ (p=0.239 n=10+10) Fixes #20427 Change-Id: Ia395cbb863215f4499b8e7ef95f4b99f51090911 Reviewed-on: https://go-review.googlesource.com/76990Reviewed-by: Austin Clements <austin@google.com>

runtime: call amd64 VDSO entry points on large stack
If the Linux kernel was built with CONFIG_OPTIMIZE_INLINING=n and was built with hardening options turned on, GCC will insert a stack probe in the VDSO function that requires a full page of stack space. The stack probe can corrupt memory if another thread is using it. Avoid sporadic crashes by calling the VDSO on the g0 or gsignal stack. While we're at it, align the stack as C code expects. We've been getting away with a misaligned stack, but it's possible that the VDSO code will change in the future to break that assumption. Benchmarks show a 11% hit on time.Now, but it's only 6ns. name old time/op new time/op delta AfterFunc-12 1.66ms ± 0% 1.66ms ± 1% ~ (p=0.905 n=9+10) After-12 1.90ms ± 6% 1.86ms ± 0% -2.05% (p=0.012 n=10+8) Stop-12 113µs ± 3% 115µs ± 2% +1.60% (p=0.017 n=9+10) SimultaneousAfterFunc-12 145µs ± 1% 144µs ± 0% -0.68% (p=0.002 n=10+8) StartStop-12 39.5µs ± 3% 40.4µs ± 5% +2.19% (p=0.023 n=10+10) Reset-12 10.2µs ± 0% 10.4µs ± 0% +2.45% (p=0.000 n=10+9) Sleep-12 190µs ± 1% 190µs ± 1% ~ (p=0.971 n=10+10) Ticker-12 4.68ms ± 2% 4.64ms ± 2% -0.83% (p=0.043 n=9+10) Now-12 48.4ns ±11% 54.0ns ±11% +11.42% (p=0.017 n=10+10) NowUnixNano-12 48.5ns ±13% 56.9ns ± 8% +17.30% (p=0.000 n=10+10) Format-12 489ns ±11% 504ns ± 6% ~ (p=0.289 n=10+10) FormatNow-12 436ns ±23% 480ns ±13% +10.25% (p=0.026 n=9+10) MarshalJSON-12 656ns ±14% 587ns ±24% ~ (p=0.063 n=10+10) MarshalText-12 647ns ± 7% 638ns ± 9% ~ (p=0.516 n=10+10) Parse-12 348ns ± 8% 328ns ± 9% -5.66% (p=0.030 n=10+10) ParseDuration-12 136ns ± 9% 140ns ±11% ~ (p=0.425 n=10+10) Hour-12 14.8ns ± 6% 15.6ns ±11% ~ (p=0.085 n=10+10) Second-12 14.0ns ± 6% 14.3ns ±12% ~ (p=0.443 n=10+10) Year-12 32.4ns ±11% 33.4ns ± 6% ~ (p=0.492 n=10+10) Day-12 41.5ns ± 9% 42.3ns ±12% ~ (p=0.239 n=10+10) Fixes #20427 Change-Id: Ia395cbb863215f4499b8e7ef95f4b99f51090911 Reviewed-on: https://go-review.googlesource.com/76990Reviewed-by: Austin Clements <austin@google.com>
a158382b · Ian Lance Taylor · fcee1897 · a158382b · a158382b
Commit a158382b authored Nov 10, 2017 by Ian Lance Taylor
Show whitespace changes
Inline Side-by-side

Showing with 95 additions and 15 deletions

src/runtime/sys_linux_386.s src/runtime/sys_linux_386.s +46 -7

src/runtime/sys_linux_amd64.s src/runtime/sys_linux_amd64.s +49 -8

No files found.
--- a/src/runtime/sys_linux_386.s
+++ b/src/runtime/sys_linux_386.s
@@ -203,17 +203,34 @@ TEXT runtime·mincore(SB),NOSPLIT,$0-16
 	RET
 // func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $16
+TEXT runtime·walltime(SB), NOSPLIT, $0-12
+	// We don't know how much stack space the VDSO code will need,
+	// so switch to g0.
+	MOVL	SP, BP	// Save old SP; BP unchanged by C code.
+	get_tls(CX)
+	MOVL	g(CX), AX
+	MOVL	g_m(AX), CX
+	MOVL	m_curg(CX), DX
+	CMPL	AX, DX		// Only switch if on curg.
+	JNE	noswitch
+	MOVL	m_g0(CX), DX
+	MOVL	(g_sched+gobuf_sp)(DX), SP	// Set SP to g0 stack
+noswitch:
+	SUBL	$16, SP		// Space for results
+	ANDL	$~15, SP	// Align for C code
 	// Stack layout, depending on call path:
 	//  x(SP)   vDSO            INVOKE_SYSCALL
 	//    12    ts.tv_nsec      ts.tv_nsec
 	//     8    ts.tv_sec       ts.tv_sec
 	//     4    &ts             -
 	//     0    CLOCK_<id>      -
-	//
-	// If we take the vDSO path, we're calling a function with gcc calling convention.
-	// We're guaranteed 128 bytes on entry. We've taken 16, and the call uses another 4,
-	// leaving 108 for __vdso_clock_gettime to use.
 	MOVL	runtime·__vdso_clock_gettime_sym(SB), AX
 	CMPL	AX, $0
 	JEQ	fallback
@@ -234,6 +251,8 @@ finish:
 	MOVL	8(SP), AX	// sec
 	MOVL	12(SP), BX	// nsec
+	MOVL	BP, SP		// Restore real SP
 	// sec is in AX, nsec in BX
 	MOVL	AX, sec_lo+0(FP)
 	MOVL	$0, sec_hi+4(FP)
@@ -242,8 +261,26 @@ finish:
 // int64 nanotime(void) so really
 // void nanotime(int64 *nsec)
-TEXT runtime·nanotime(SB), NOSPLIT, $16
+TEXT runtime·nanotime(SB), NOSPLIT, $0-8
-	// See comments above in walltime() about stack space usage and layout.
+	// Switch to g0 stack. See comment above in runtime·walltime.
+	MOVL	SP, BP	// Save old SP; BP unchanged by C code.
+	get_tls(CX)
+	MOVL	g(CX), AX
+	MOVL	g_m(AX), CX
+	MOVL	m_curg(CX), DX
+	CMPL	AX, DX		// Only switch if on curg.
+	JNE	noswitch
+	MOVL	m_g0(CX), DX
+	MOVL	(g_sched+gobuf_sp)(DX), SP	// Set SP to g0 stack
+noswitch:
+	SUBL	$16, SP		// Space for results
+	ANDL	$~15, SP	// Align for C code
 	MOVL	runtime·__vdso_clock_gettime_sym(SB), AX
 	CMPL	AX, $0
 	JEQ	fallback
@@ -264,6 +301,8 @@ finish:
 	MOVL	8(SP), AX	// sec
 	MOVL	12(SP), BX	// nsec
+	MOVL	BP, SP		// Restore real SP
 	// sec is in AX, nsec in BX
 	// convert to DX:AX nsec
 	MOVL	$1000000000, CX

--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -181,11 +181,31 @@ TEXT runtime·mincore(SB),NOSPLIT,$0-28
 	RET
 // func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB),NOSPLIT,$16
+TEXT runtime·walltime(SB),NOSPLIT,$0-12
-	// Be careful. We're calling a function with gcc calling convention here.
+	// We don't know how much stack space the VDSO code will need,
-	// We're guaranteed 128 bytes on entry, and we've taken 16, and the
+	// so switch to g0.
-	// call uses another 8.
+	// In particular, a kernel configured with CONFIG_OPTIMIZE_INLINING=n
-	// That leaves 104 for the gettime code to use. Hope that's enough!
+	// and hardening can use a full page of stack space in gettime_sym
+	// due to stack probes inserted to avoid stack/heap collisions.
+	// See issue #20427.
+	MOVQ	SP, BP	// Save old SP; BP unchanged by C code.
+	get_tls(CX)
+	MOVQ	g(CX), AX
+	MOVQ	g_m(AX), CX
+	MOVQ	m_curg(CX), DX
+	CMPQ	AX, DX		// Only switch if on curg.
+	JNE	noswitch
+	MOVQ	m_g0(CX), DX
+	MOVQ	(g_sched+gobuf_sp)(DX), SP	// Set SP to g0 stack
+noswitch:
+	SUBQ	$16, SP		// Space for results
+	ANDQ	$~15, SP	// Align for C code
 	MOVQ	runtime·__vdso_clock_gettime_sym(SB), AX
 	CMPQ	AX, $0
 	JEQ	fallback
@@ -194,6 +214,7 @@ TEXT runtime·walltime(SB),NOSPLIT,$16
 	CALL	AX
 	MOVQ	0(SP), AX	// sec
 	MOVQ	8(SP), DX	// nsec
+	MOVQ	BP, SP		// Restore real SP
 	MOVQ	AX, sec+0(FP)
 	MOVL	DX, nsec+8(FP)
 	RET
@@ -205,13 +226,31 @@ fallback:
 	MOVQ	0(SP), AX	// sec
 	MOVL	8(SP), DX	// usec
 	IMULQ	$1000, DX
+	MOVQ	BP, SP		// Restore real SP
 	MOVQ	AX, sec+0(FP)
 	MOVL	DX, nsec+8(FP)
 	RET
-TEXT runtime·nanotime(SB),NOSPLIT,$16
+TEXT runtime·nanotime(SB),NOSPLIT,$0-8
-	// Duplicate time.now here to avoid using up precious stack space.
+	// Switch to g0 stack. See comment above in runtime·walltime.
-	// See comment above in time.now.
+	MOVQ	SP, BP	// Save old SP; BX unchanged by C code.
+	get_tls(CX)
+	MOVQ	g(CX), AX
+	MOVQ	g_m(AX), CX
+	MOVQ	m_curg(CX), DX
+	CMPQ	AX, DX		// Only switch if on curg.
+	JNE	noswitch
+	MOVQ	m_g0(CX), DX
+	MOVQ	(g_sched+gobuf_sp)(DX), SP	// Set SP to g0 stack
+noswitch:
+	SUBQ	$16, SP		// Space for results
+	ANDQ	$~15, SP	// Align for C code
 	MOVQ	runtime·__vdso_clock_gettime_sym(SB), AX
 	CMPQ	AX, $0
 	JEQ	fallback
@@ -220,6 +259,7 @@ TEXT runtime·nanotime(SB),NOSPLIT,$16
 	CALL	AX
 	MOVQ	0(SP), AX	// sec
 	MOVQ	8(SP), DX	// nsec
+	MOVQ	BP, SP		// Restore real SP
 	// sec is in AX, nsec in DX
 	// return nsec in AX
 	IMULQ	$1000000000, AX
@@ -233,6 +273,7 @@ fallback:
 	CALL	AX
 	MOVQ	0(SP), AX	// sec
 	MOVL	8(SP), DX	// usec
+	MOVQ	BP, SP		// Restore real SP
 	IMULQ	$1000, DX
 	// sec is in AX, nsec in DX
 	// return nsec in AX