Commit db7fd1c1 authored by Rick Hudson's avatar Rick Hudson

runtime: increase GC concurrency.

run GC in its own background goroutine making the
caller runnable if resources are available. This is
critical in single goroutine applications.
Allow goroutines that allocate a lot to help out
the GC and in doing so throttle their own allocation.
Adjust test so that it only detects that a GC is run
during init calls and not whether the GC is memory
efficient. Memory efficiency work will happen later
in 1.5.

Change-Id: I4306f5e377bb47c69bda1aedba66164f12b20c2b
Reviewed-on: https://go-review.googlesource.com/2349Reviewed-by: default avatarRuss Cox <rsc@golang.org>
Reviewed-by: default avatarAustin Clements <austin@google.com>
parent f21ee1e1
...@@ -39,10 +39,27 @@ type pageID uintptr ...@@ -39,10 +39,27 @@ type pageID uintptr
// base address for all 0-byte allocations // base address for all 0-byte allocations
var zerobase uintptr var zerobase uintptr
// Determine whether to initiate a GC.
// Currently the primitive heuristic we use will start a new
// concurrent GC when approximately half the available space
// made available by the last GC cycle has been used.
// If the GC is already working no need to trigger another one.
// This should establish a feedback loop where if the GC does not
// have sufficient time to complete then more memory will be
// requested from the OS increasing heap size thus allow future
// GCs more time to complete.
// memstat.heap_alloc and memstat.next_gc reads have benign races
// A false negative simple does not start a GC, a false positive
// will start a GC needlessly. Neither have correctness issues.
func shouldtriggergc() bool {
return memstats.heap_alloc+memstats.heap_alloc*3/4 >= memstats.next_gc && atomicloaduint(&bggc.working) == 0
}
// Allocate an object of size bytes. // Allocate an object of size bytes.
// Small objects are allocated from the per-P cache's free lists. // Small objects are allocated from the per-P cache's free lists.
// Large objects (> 32 kB) are allocated straight from the heap. // Large objects (> 32 kB) are allocated straight from the heap.
func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
shouldhelpgc := false
if size == 0 { if size == 0 {
return unsafe.Pointer(&zerobase) return unsafe.Pointer(&zerobase)
} }
...@@ -144,6 +161,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { ...@@ -144,6 +161,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
systemstack(func() { systemstack(func() {
mCache_Refill(c, tinySizeClass) mCache_Refill(c, tinySizeClass)
}) })
shouldhelpgc = true
s = c.alloc[tinySizeClass] s = c.alloc[tinySizeClass]
v = s.freelist v = s.freelist
} }
...@@ -174,6 +192,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { ...@@ -174,6 +192,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
systemstack(func() { systemstack(func() {
mCache_Refill(c, int32(sizeclass)) mCache_Refill(c, int32(sizeclass))
}) })
shouldhelpgc = true
s = c.alloc[sizeclass] s = c.alloc[sizeclass]
v = s.freelist v = s.freelist
} }
...@@ -191,6 +210,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer { ...@@ -191,6 +210,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
c.local_cachealloc += intptr(size) c.local_cachealloc += intptr(size)
} else { } else {
var s *mspan var s *mspan
shouldhelpgc = true
systemstack(func() { systemstack(func() {
s = largeAlloc(size, uint32(flags)) s = largeAlloc(size, uint32(flags))
}) })
...@@ -345,8 +365,15 @@ marked: ...@@ -345,8 +365,15 @@ marked:
} }
} }
if memstats.heap_alloc >= memstats.next_gc/2 { if shouldtriggergc() {
gogc(0) gogc(0)
} else if shouldhelpgc && atomicloaduint(&bggc.working) == 1 {
// bggc.lock not taken since race on bggc.working is benign.
// At worse we don't call gchelpwork.
// Delay the gchelpwork until the epilogue so that it doesn't
// interfere with the inner working of malloc such as
// mcache refills that might happen while doing the gchelpwork
systemstack(gchelpwork)
} }
return x return x
...@@ -466,14 +493,25 @@ func gogc(force int32) { ...@@ -466,14 +493,25 @@ func gogc(force int32) {
releasem(mp) releasem(mp)
mp = nil mp = nil
semacquire(&worldsema, false) if force == 0 {
lock(&bggc.lock)
if force == 0 && memstats.heap_alloc < memstats.next_gc { if !bggc.started {
// typically threads which lost the race to grab bggc.working = 1
// worldsema exit here when gc is done. bggc.started = true
semrelease(&worldsema) go backgroundgc()
return } else if bggc.working == 0 {
bggc.working = 1
ready(bggc.g)
}
unlock(&bggc.lock)
} else {
gcwork(force)
} }
}
func gcwork(force int32) {
semacquire(&worldsema, false)
// Pick up the remaining unswept/not being swept spans concurrently // Pick up the remaining unswept/not being swept spans concurrently
for gosweepone() != ^uintptr(0) { for gosweepone() != ^uintptr(0) {
...@@ -482,14 +520,17 @@ func gogc(force int32) { ...@@ -482,14 +520,17 @@ func gogc(force int32) {
// Ok, we're doing it! Stop everybody else // Ok, we're doing it! Stop everybody else
startTime := nanotime() mp := acquirem()
mp = acquirem()
mp.gcing = 1 mp.gcing = 1
releasem(mp) releasem(mp)
gctimer.count++ gctimer.count++
if force == 0 { if force == 0 {
gctimer.cycle.sweepterm = nanotime() gctimer.cycle.sweepterm = nanotime()
} }
// Pick up the remaining unswept/not being swept spans before we STW
for gosweepone() != ^uintptr(0) {
sweep.nbgsweep++
}
systemstack(stoptheworld) systemstack(stoptheworld)
systemstack(finishsweep_m) // finish sweep before we start concurrent scan. systemstack(finishsweep_m) // finish sweep before we start concurrent scan.
if force == 0 { // Do as much work concurrently as possible if force == 0 { // Do as much work concurrently as possible
...@@ -500,7 +541,7 @@ func gogc(force int32) { ...@@ -500,7 +541,7 @@ func gogc(force int32) {
systemstack(gcscan_m) systemstack(gcscan_m)
gctimer.cycle.installmarkwb = nanotime() gctimer.cycle.installmarkwb = nanotime()
systemstack(stoptheworld) systemstack(stoptheworld)
gcinstallmarkwb() systemstack(gcinstallmarkwb)
systemstack(starttheworld) systemstack(starttheworld)
gctimer.cycle.mark = nanotime() gctimer.cycle.mark = nanotime()
systemstack(gcmark_m) systemstack(gcmark_m)
...@@ -509,6 +550,7 @@ func gogc(force int32) { ...@@ -509,6 +550,7 @@ func gogc(force int32) {
systemstack(gcinstalloffwb_m) systemstack(gcinstalloffwb_m)
} }
startTime := nanotime()
if mp != acquirem() { if mp != acquirem() {
throw("gogc: rescheduled") throw("gogc: rescheduled")
} }
...@@ -527,6 +569,7 @@ func gogc(force int32) { ...@@ -527,6 +569,7 @@ func gogc(force int32) {
eagersweep := force >= 2 eagersweep := force >= 2
for i := 0; i < n; i++ { for i := 0; i < n; i++ {
if i > 0 { if i > 0 {
// refresh start time if doing a second GC
startTime = nanotime() startTime = nanotime()
} }
// switch to g0, call gc, then switch back // switch to g0, call gc, then switch back
...@@ -579,8 +622,8 @@ func GCcheckmarkdisable() { ...@@ -579,8 +622,8 @@ func GCcheckmarkdisable() {
// gctimes records the time in nanoseconds of each phase of the concurrent GC. // gctimes records the time in nanoseconds of each phase of the concurrent GC.
type gctimes struct { type gctimes struct {
sweepterm int64 // stw sweepterm int64 // stw
scan int64 // stw scan int64
installmarkwb int64 installmarkwb int64 // stw
mark int64 mark int64
markterm int64 // stw markterm int64 // stw
sweep int64 sweep int64
...@@ -601,7 +644,7 @@ type gcchronograph struct { ...@@ -601,7 +644,7 @@ type gcchronograph struct {
var gctimer gcchronograph var gctimer gcchronograph
// GCstarttimes initializes the gc timess. All previous timess are lost. // GCstarttimes initializes the gc times. All previous times are lost.
func GCstarttimes(verbose int64) { func GCstarttimes(verbose int64) {
gctimer = gcchronograph{verbose: verbose} gctimer = gcchronograph{verbose: verbose}
} }
...@@ -655,6 +698,11 @@ func calctimes() gctimes { ...@@ -655,6 +698,11 @@ func calctimes() gctimes {
// the information from the most recent Concurent GC cycle. Calls from the // the information from the most recent Concurent GC cycle. Calls from the
// application to runtime.GC() are ignored. // application to runtime.GC() are ignored.
func GCprinttimes() { func GCprinttimes() {
if gctimer.verbose == 0 {
println("GC timers not enabled")
return
}
// Explicitly put times on the heap so printPhase can use it. // Explicitly put times on the heap so printPhase can use it.
times := new(gctimes) times := new(gctimes)
*times = calctimes() *times = calctimes()
......
...@@ -123,7 +123,7 @@ const ( ...@@ -123,7 +123,7 @@ const (
_DebugGCPtrs = false // if true, print trace of every pointer load during GC _DebugGCPtrs = false // if true, print trace of every pointer load during GC
_ConcurrentSweep = true _ConcurrentSweep = true
_WorkbufSize = 4 * 1024 _WorkbufSize = 4 * 256
_FinBlockSize = 4 * 1024 _FinBlockSize = 4 * 1024
_RootData = 0 _RootData = 0
_RootBss = 1 _RootBss = 1
...@@ -191,9 +191,9 @@ var badblock [1024]uintptr ...@@ -191,9 +191,9 @@ var badblock [1024]uintptr
var nbadblock int32 var nbadblock int32
type workdata struct { type workdata struct {
full uint64 // lock-free list of full blocks full uint64 // lock-free list of full blocks workbuf
empty uint64 // lock-free list of empty blocks empty uint64 // lock-free list of empty blocks workbuf
partial uint64 // lock-free list of partially filled blocks partial uint64 // lock-free list of partially filled blocks workbuf
pad0 [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait pad0 [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
nproc uint32 nproc uint32
tstart int64 tstart int64
...@@ -587,6 +587,11 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8) { ...@@ -587,6 +587,11 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8) {
// base and extent. // base and extent.
b := b0 b := b0
n := n0 n := n0
// ptrmask can have 2 possible values:
// 1. nil - obtain pointer mask from GC bitmap.
// 2. pointer to a compact mask (for stacks and data).
wbuf := getpartialorempty() wbuf := getpartialorempty()
if b != 0 { if b != 0 {
wbuf = scanobject(b, n, ptrmask, wbuf) wbuf = scanobject(b, n, ptrmask, wbuf)
...@@ -600,23 +605,23 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8) { ...@@ -600,23 +605,23 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8) {
return return
} }
} }
if gcphase == _GCscan {
throw("scanblock: In GCscan phase but no b passed in.")
}
keepworking := b == 0 drainallwbufs := b == 0
drainworkbuf(wbuf, drainallwbufs)
}
// Scan objects in wbuf until wbuf is empty.
// If drainallwbufs is true find all other available workbufs and repeat the process.
//go:nowritebarrier
func drainworkbuf(wbuf *workbuf, drainallwbufs bool) {
if gcphase != _GCmark && gcphase != _GCmarktermination { if gcphase != _GCmark && gcphase != _GCmarktermination {
println("gcphase", gcphase) println("gcphase", gcphase)
throw("scanblock phase") throw("scanblock phase")
} }
// ptrmask can have 2 possible values:
// 1. nil - obtain pointer mask from GC bitmap.
// 2. pointer to a compact mask (for stacks and data).
for { for {
if wbuf.nobj == 0 { if wbuf.nobj == 0 {
if !keepworking { if !drainallwbufs {
putempty(wbuf) putempty(wbuf)
return return
} }
...@@ -641,9 +646,30 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8) { ...@@ -641,9 +646,30 @@ func scanblock(b0, n0 uintptr, ptrmask *uint8) {
// PREFETCH(wbuf->obj[wbuf->nobj - 3]; // PREFETCH(wbuf->obj[wbuf->nobj - 3];
// } // }
wbuf.nobj-- wbuf.nobj--
b = wbuf.obj[wbuf.nobj] b := wbuf.obj[wbuf.nobj]
wbuf = scanobject(b, mheap_.arena_used-b, nil, wbuf)
}
}
// Scan at most count objects in the wbuf.
//go:nowritebarrier
func drainobjects(wbuf *workbuf, count uintptr) {
for i := uintptr(0); i < count; i++ {
if wbuf.nobj == 0 {
putempty(wbuf)
return
}
// This might be a good place to add prefetch code...
// if(wbuf->nobj > 4) {
// PREFETCH(wbuf->obj[wbuf->nobj - 3];
// }
wbuf.nobj--
b := wbuf.obj[wbuf.nobj]
wbuf = scanobject(b, mheap_.arena_used-b, nil, wbuf) wbuf = scanobject(b, mheap_.arena_used-b, nil, wbuf)
} }
putpartial(wbuf)
return
} }
//go:nowritebarrier //go:nowritebarrier
...@@ -809,6 +835,17 @@ func putpartial(b *workbuf) { ...@@ -809,6 +835,17 @@ func putpartial(b *workbuf) {
} }
} }
// trygetfull tries to get a full or partially empty workbuffer.
// if one is not immediately available return nil
//go:nowritebarrier
func trygetfull() *workbuf {
wbuf := (*workbuf)(lfstackpop(&work.full))
if wbuf == nil {
wbuf = (*workbuf)(lfstackpop(&work.partial))
}
return wbuf
}
// Get a full work buffer off the work.full or a partially // Get a full work buffer off the work.full or a partially
// filled one off the work.partial list. If nothing is available // filled one off the work.partial list. If nothing is available
// wait until all the other gc helpers have finished and then // wait until all the other gc helpers have finished and then
...@@ -1090,6 +1127,38 @@ func gcmarkwb_m(slot *uintptr, ptr uintptr) { ...@@ -1090,6 +1127,38 @@ func gcmarkwb_m(slot *uintptr, ptr uintptr) {
} }
} }
// gchelpwork does a small bounded amount of gc work. The purpose is to
// shorten the time (as measured by allocations) spent doing a concurrent GC.
// The number of mutator calls is roughly propotional to the number of allocations
// made by that mutator. This slows down the allocation while speeding up the GC.
//go:nowritebarrier
func gchelpwork() {
switch gcphase {
default:
throw("gcphasework in bad gcphase")
case _GCoff, _GCquiesce, _GCstw:
// No work.
case _GCsweep:
// We could help by calling sweepone to sweep a single span.
// _ = sweepone()
case _GCscan:
// scan the stack, mark the objects, put pointers in work buffers
// hanging off the P where this is being run.
// scanstack(gp)
case _GCmark:
// Get a full work buffer and empty it.
var wbuf *workbuf
wbuf = trygetfull()
if wbuf != nil {
drainobjects(wbuf, uintptr(len(wbuf.obj))) // drain upto one buffer's worth of objects
}
case _GCmarktermination:
// We should never be here since the world is stopped.
// All available mark work will be emptied before returning.
throw("gcphasework in bad gcphase")
}
}
// The gp has been moved to a GC safepoint. GC phase specific // The gp has been moved to a GC safepoint. GC phase specific
// work is done here. // work is done here.
//go:nowritebarrier //go:nowritebarrier
...@@ -1425,6 +1494,14 @@ type sweepdata struct { ...@@ -1425,6 +1494,14 @@ type sweepdata struct {
var sweep sweepdata var sweep sweepdata
// State of the background concurrent GC goroutine.
var bggc struct {
lock mutex
g *g
working uint
started bool
}
// sweeps one span // sweeps one span
// returns number of pages returned to heap, or ^uintptr(0) if there is nothing to sweep // returns number of pages returned to heap, or ^uintptr(0) if there is nothing to sweep
//go:nowritebarrier //go:nowritebarrier
......
...@@ -78,6 +78,19 @@ func clearpools() { ...@@ -78,6 +78,19 @@ func clearpools() {
} }
} }
// backgroundgc is running in a goroutine and does the concurrent GC work.
// bggc holds the state of the backgroundgc.
func backgroundgc() {
bggc.g = getg()
bggc.g.issystem = true
for {
gcwork(0)
lock(&bggc.lock)
bggc.working = 0
goparkunlock(&bggc.lock, "Concurrent GC wait")
}
}
func bgsweep() { func bgsweep() {
sweep.g = getg() sweep.g = getg()
getg().issystem = true getg().issystem = true
......
...@@ -31,7 +31,7 @@ func init() { ...@@ -31,7 +31,7 @@ func init() {
} }
runtime.ReadMemStats(memstats) runtime.ReadMemStats(memstats)
sys1 := memstats.Sys sys1 := memstats.Sys
if sys1-sys > chunk*50 { if sys1-sys > chunk*500 {
println("allocated 1000 chunks of", chunk, "and used ", sys1-sys, "memory") println("allocated 1000 chunks of", chunk, "and used ", sys1-sys, "memory")
panic("init1") panic("init1")
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment