Commit f8b2314c authored by Austin Clements's avatar Austin Clements

runtime: optimize defer code

This optimizes deferproc and deferreturn in various ways.

The most important optimization is that it more carefully arranges to
prevent preemption or stack growth. Currently we do this by switching
to the system stack on every deferproc and every deferreturn. While we
need to be on the system stack for the slow path of allocating and
freeing defers, in the common case we can fit in the nosplit stack.
Hence, this change pushes the system stack switch down into the slow
paths and makes everything now exposed to the user stack nosplit. This
also eliminates the need for various acquirem/releasem pairs, since we
are now preventing preemption by preventing stack split checks.

As another smaller optimization, we special case the common cases of
zero-sized and pointer-sized defer frames to respectively skip the
copy and perform the copy in line instead of calling memmove.

This speeds up the runtime defer benchmark by 42%:

name           old time/op  new time/op  delta
Defer-4        75.1ns ± 1%  43.3ns ± 1%  -42.31%   (p=0.000 n=8+10)

In reality, this speeds up defer by about 2.2X. The two benchmarks
below compare a Lock/defer Unlock pair (DeferLock) with a Lock/Unlock
pair (NoDeferLock). NoDeferLock establishes a baseline cost, so these
two benchmarks together show that this change reduces the overhead of
defer from 61.4ns to 27.9ns.

name           old time/op  new time/op  delta
DeferLock-4    77.4ns ± 1%  43.9ns ± 1%  -43.31%  (p=0.000 n=10+10)
NoDeferLock-4  16.0ns ± 0%  15.9ns ± 0%   -0.39%    (p=0.000 n=9+8)

This also shaves 34ns off cgo calls:

name       old time/op  new time/op  delta
CgoNoop-4   122ns ± 1%  88.3ns ± 1%  -27.72%  (p=0.000 n=8+9)

Updates #14939, #16051.

Change-Id: I2baa0dea378b7e4efebbee8fca919a97d5e15f38
Reviewed-on: https://go-review.googlesource.com/29656Reviewed-by: default avatarKeith Randall <khr@golang.org>
parent d211c2d3
...@@ -6,6 +6,7 @@ package runtime ...@@ -6,6 +6,7 @@ package runtime
import ( import (
"runtime/internal/atomic" "runtime/internal/atomic"
"runtime/internal/sys"
"unsafe" "unsafe"
) )
...@@ -84,16 +85,21 @@ func deferproc(siz int32, fn *funcval) { // arguments of fn follow fn ...@@ -84,16 +85,21 @@ func deferproc(siz int32, fn *funcval) { // arguments of fn follow fn
argp := uintptr(unsafe.Pointer(&fn)) + unsafe.Sizeof(fn) argp := uintptr(unsafe.Pointer(&fn)) + unsafe.Sizeof(fn)
callerpc := getcallerpc(unsafe.Pointer(&siz)) callerpc := getcallerpc(unsafe.Pointer(&siz))
systemstack(func() { d := newdefer(siz)
d := newdefer(siz) if d._panic != nil {
if d._panic != nil { throw("deferproc: d.panic != nil after newdefer")
throw("deferproc: d.panic != nil after newdefer") }
} d.fn = fn
d.fn = fn d.pc = callerpc
d.pc = callerpc d.sp = sp
d.sp = sp switch siz {
memmove(add(unsafe.Pointer(d), unsafe.Sizeof(*d)), unsafe.Pointer(argp), uintptr(siz)) case 0:
}) // Do nothing.
case sys.PtrSize:
*(*uintptr)(deferArgs(d)) = *(*uintptr)(unsafe.Pointer(argp))
default:
memmove(deferArgs(d), unsafe.Pointer(argp), uintptr(siz))
}
// deferproc returns 0 normally. // deferproc returns 0 normally.
// a deferred func that stops a panic // a deferred func that stops a panic
...@@ -175,22 +181,30 @@ func init() { ...@@ -175,22 +181,30 @@ func init() {
// Allocate a Defer, usually using per-P pool. // Allocate a Defer, usually using per-P pool.
// Each defer must be released with freedefer. // Each defer must be released with freedefer.
// Note: runs on g0 stack //
// This must not grow the stack because there may be a frame without
// stack map information when this is called.
//
//go:nosplit
func newdefer(siz int32) *_defer { func newdefer(siz int32) *_defer {
var d *_defer var d *_defer
sc := deferclass(uintptr(siz)) sc := deferclass(uintptr(siz))
mp := acquirem() gp := getg()
if sc < uintptr(len(p{}.deferpool)) { if sc < uintptr(len(p{}.deferpool)) {
pp := mp.p.ptr() pp := gp.m.p.ptr()
if len(pp.deferpool[sc]) == 0 && sched.deferpool[sc] != nil { if len(pp.deferpool[sc]) == 0 && sched.deferpool[sc] != nil {
lock(&sched.deferlock) // Take the slow path on the system stack so
for len(pp.deferpool[sc]) < cap(pp.deferpool[sc])/2 && sched.deferpool[sc] != nil { // we don't grow newdefer's stack.
d := sched.deferpool[sc] systemstack(func() {
sched.deferpool[sc] = d.link lock(&sched.deferlock)
d.link = nil for len(pp.deferpool[sc]) < cap(pp.deferpool[sc])/2 && sched.deferpool[sc] != nil {
pp.deferpool[sc] = append(pp.deferpool[sc], d) d := sched.deferpool[sc]
} sched.deferpool[sc] = d.link
unlock(&sched.deferlock) d.link = nil
pp.deferpool[sc] = append(pp.deferpool[sc], d)
}
unlock(&sched.deferlock)
})
} }
if n := len(pp.deferpool[sc]); n > 0 { if n := len(pp.deferpool[sc]); n > 0 {
d = pp.deferpool[sc][n-1] d = pp.deferpool[sc][n-1]
...@@ -200,19 +214,24 @@ func newdefer(siz int32) *_defer { ...@@ -200,19 +214,24 @@ func newdefer(siz int32) *_defer {
} }
if d == nil { if d == nil {
// Allocate new defer+args. // Allocate new defer+args.
total := roundupsize(totaldefersize(uintptr(siz))) systemstack(func() {
d = (*_defer)(mallocgc(total, deferType, true)) total := roundupsize(totaldefersize(uintptr(siz)))
d = (*_defer)(mallocgc(total, deferType, true))
})
} }
d.siz = siz d.siz = siz
gp := mp.curg
d.link = gp._defer d.link = gp._defer
gp._defer = d gp._defer = d
releasem(mp)
return d return d
} }
// Free the given defer. // Free the given defer.
// The defer cannot be used after this call. // The defer cannot be used after this call.
//
// This must not grow the stack because there may be a frame without a
// stack map when this is called.
//
//go:nosplit
func freedefer(d *_defer) { func freedefer(d *_defer) {
if d._panic != nil { if d._panic != nil {
freedeferpanic() freedeferpanic()
...@@ -222,31 +241,34 @@ func freedefer(d *_defer) { ...@@ -222,31 +241,34 @@ func freedefer(d *_defer) {
} }
sc := deferclass(uintptr(d.siz)) sc := deferclass(uintptr(d.siz))
if sc < uintptr(len(p{}.deferpool)) { if sc < uintptr(len(p{}.deferpool)) {
mp := acquirem() pp := getg().m.p.ptr()
pp := mp.p.ptr()
if len(pp.deferpool[sc]) == cap(pp.deferpool[sc]) { if len(pp.deferpool[sc]) == cap(pp.deferpool[sc]) {
// Transfer half of local cache to the central cache. // Transfer half of local cache to the central cache.
var first, last *_defer //
for len(pp.deferpool[sc]) > cap(pp.deferpool[sc])/2 { // Take this slow path on the system stack so
n := len(pp.deferpool[sc]) // we don't grow freedefer's stack.
d := pp.deferpool[sc][n-1] systemstack(func() {
pp.deferpool[sc][n-1] = nil var first, last *_defer
pp.deferpool[sc] = pp.deferpool[sc][:n-1] for len(pp.deferpool[sc]) > cap(pp.deferpool[sc])/2 {
if first == nil { n := len(pp.deferpool[sc])
first = d d := pp.deferpool[sc][n-1]
} else { pp.deferpool[sc][n-1] = nil
last.link = d pp.deferpool[sc] = pp.deferpool[sc][:n-1]
if first == nil {
first = d
} else {
last.link = d
}
last = d
} }
last = d lock(&sched.deferlock)
} last.link = sched.deferpool[sc]
lock(&sched.deferlock) sched.deferpool[sc] = first
last.link = sched.deferpool[sc] unlock(&sched.deferlock)
sched.deferpool[sc] = first })
unlock(&sched.deferlock)
} }
*d = _defer{} *d = _defer{}
pp.deferpool[sc] = append(pp.deferpool[sc], d) pp.deferpool[sc] = append(pp.deferpool[sc], d)
releasem(mp)
} }
} }
...@@ -288,19 +310,23 @@ func deferreturn(arg0 uintptr) { ...@@ -288,19 +310,23 @@ func deferreturn(arg0 uintptr) {
} }
// Moving arguments around. // Moving arguments around.
// Do not allow preemption here, because the garbage collector //
// won't know the form of the arguments until the jmpdefer can // Everything called after this point must be recursively
// flip the PC over to fn. // nosplit because the garbage collector won't know the form
mp := acquirem() // of the arguments until the jmpdefer can flip the PC over to
memmove(unsafe.Pointer(&arg0), deferArgs(d), uintptr(d.siz)) // fn.
switch d.siz {
case 0:
// Do nothing.
case sys.PtrSize:
*(*uintptr)(unsafe.Pointer(&arg0)) = *(*uintptr)(deferArgs(d))
default:
memmove(unsafe.Pointer(&arg0), deferArgs(d), uintptr(d.siz))
}
fn := d.fn fn := d.fn
d.fn = nil d.fn = nil
gp._defer = d.link gp._defer = d.link
// Switch to systemstack merely to save nosplit stack space. freedefer(d)
systemstack(func() {
freedefer(d)
})
releasem(mp)
jmpdefer(fn, uintptr(unsafe.Pointer(&arg0))) jmpdefer(fn, uintptr(unsafe.Pointer(&arg0)))
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment