Commit 845aa1fc authored by Dmitriy Vyukov's avatar Dmitriy Vyukov Committed by Russ Cox

runtime: faster GC sweep phase

benchmark                              old ns/op    new ns/op    delta

garbage.BenchmarkParser               3731065750   3715543750   -0.41%
garbage.BenchmarkParser-2             3631299750   3495248500   -3.75%
garbage.BenchmarkParser-4             3386486000   3339353000   -1.39%
garbage.BenchmarkParser-8             3267632000   3286422500   +0.58%
garbage.BenchmarkParser-16            3299203000   3316081750   +0.51%

garbage.BenchmarkTree                  977532888    919453833   -5.94%
garbage.BenchmarkTree-2                919948555    853478000   -7.23%
garbage.BenchmarkTree-4                841329000    790207000   -6.08%
garbage.BenchmarkTree-8                787792777    740380666   -6.01%
garbage.BenchmarkTree-16               899257166    846594555   -5.86%

garbage.BenchmarkTree2                 574876300    571885800   -0.52%
garbage.BenchmarkTree2-2               348162700    345888900   -0.65%
garbage.BenchmarkTree2-4               184912500    179137000   -3.22%
garbage.BenchmarkTree2-8               104243900    103485600   -0.73%
garbage.BenchmarkTree2-16               97269500     85137100  -14.25%

garbage.BenchmarkParserPause           141101976    157746974  +11.80%
garbage.BenchmarkParserPause-2         103096051     83043048  -19.45%
garbage.BenchmarkParserPause-4          52153133     45951111  -11.89%
garbage.BenchmarkParserPause-8          36730190     38901024   +5.91%
garbage.BenchmarkParserPause-16         32678875     29578585   -9.49%

garbage.BenchmarkTreePause              29487065     29648439   +0.55%
garbage.BenchmarkTreePause-2            22443494     21306159   -5.07%
garbage.BenchmarkTreePause-4            15799691     14985647   -5.15%
garbage.BenchmarkTreePause-8            10768112     9531420   -12.97%
garbage.BenchmarkTreePause-16           16329891     15205158   -6.89%

garbage.BenchmarkTree2Pause           2586957240   2577533200   -0.36%
garbage.BenchmarkTree2Pause-2         1683383760   1673923800   -0.56%
garbage.BenchmarkTree2Pause-4         1102860320   1074040280   -2.68%
garbage.BenchmarkTree2Pause-8          902627920    886122400   -1.86%
garbage.BenchmarkTree2Pause-16         856470920    804152320   -6.50%

garbage.BenchmarkParserLastPause       277316000    280839000   +1.25%
garbage.BenchmarkParserLastPause-2     179446000    163687000   -8.78%
garbage.BenchmarkParserLastPause-4     106752000     94144000  -11.81%
garbage.BenchmarkParserLastPause-8      57758000     61640000   +6.72%
garbage.BenchmarkParserLastPause-16     51235000     42552000  -16.95%

garbage.BenchmarkTreeLastPause          45244000     50786000  +12.25%
garbage.BenchmarkTreeLastPause-2        37163000     34654000   -6.75%
garbage.BenchmarkTreeLastPause-4        24178000     21967000   -9.14%
garbage.BenchmarkTreeLastPause-8        20390000     15648000  -30.30%
garbage.BenchmarkTreeLastPause-16       22398000     20180000   -9.90%

garbage.BenchmarkTree2LastPause       5748706000   5718809000   -0.52%
garbage.BenchmarkTree2LastPause-2     3481570000   3458844000   -0.65%
garbage.BenchmarkTree2LastPause-4     1849073000   1791330000   -3.22%
garbage.BenchmarkTree2LastPause-8     1042375000   1034811000   -0.73%
garbage.BenchmarkTree2LastPause-16     972637000    851323000  -14.25%

There is also visible improvement in consumed CPU time:
tree2 -heapsize=8000000000 -cpus=12
before: 248.74user 6.36system 0:52.74elapsed 483%CPU
after:  229.86user 6.33system 0:51.08elapsed 462%CPU
-1.66s of real time, but -18.91s of consumed CPU time

R=golang-dev
CC=golang-dev
https://golang.org/cl/6215065
parent 581e7c2a
......@@ -120,10 +120,9 @@ static struct {
uint32 nproc;
volatile uint32 nwait;
volatile uint32 ndone;
volatile uint32 debugmarkdone;
Note alldone;
Lock markgate;
Lock sweepgate;
uint32 spanidx;
ParFor *sweepfor;
Lock;
byte *chunk;
......@@ -720,40 +719,10 @@ handlespecial(byte *p, uintptr size)
return true;
}
static void sweepspan(MSpan *s);
// Sweep frees or collects finalizers for blocks not marked in the mark phase.
// It clears the mark bits in preparation for the next GC round.
static void
sweep(void)
{
MSpan *s, **allspans;
int64 now;
uint32 spanidx, nspan;
now = runtime·nanotime();
nspan = runtime·mheap.nspan;
allspans = runtime·mheap.allspans;
for(;;) {
spanidx = runtime·xadd(&work.spanidx, 1) - 1;
if(spanidx >= nspan)
break;
s = allspans[spanidx];
// Stamp newly unused spans. The scavenger will use that
// info to potentially give back some pages to the OS.
if(s->state == MSpanFree && s->unusedsince == 0)
s->unusedsince = now;
if(s->state != MSpanInUse)
continue;
sweepspan(s);
}
}
static void
sweepspan(MSpan *s)
sweepspan(ParFor *desc, uint32 idx)
{
int32 cl, n, npages;
uintptr size;
......@@ -762,7 +731,16 @@ sweepspan(MSpan *s)
byte *arena_start;
MLink *start, *end;
int32 nfree;
MSpan *s;
USED(&desc);
s = runtime·mheap.allspans[idx];
// Stamp newly unused spans. The scavenger will use that
// info to potentially give back some pages to the OS.
if(s->state == MSpanFree && s->unusedsince == 0)
s->unusedsince = runtime·nanotime();
if(s->state != MSpanInUse)
return;
arena_start = runtime·mheap.arena_start;
p = (byte*)(s->start << PageShift);
cl = s->sizeclass;
......@@ -847,16 +825,15 @@ sweepspan(MSpan *s)
void
runtime·gchelper(void)
{
// Wait until main proc is ready for mark help.
runtime·lock(&work.markgate);
runtime·unlock(&work.markgate);
scanblock(nil, 0);
// Wait until main proc is ready for sweep help.
runtime·lock(&work.sweepgate);
runtime·unlock(&work.sweepgate);
sweep();
if(DebugMark) {
// wait while the main thread executes mark(debug_scanblock)
while(runtime·atomicload(&work.debugmarkdone) == 0)
runtime·usleep(10);
}
runtime·parfordo(work.sweepfor);
if(runtime·xadd(&work.ndone, +1) == work.nproc-1)
runtime·notewakeup(&work.alldone);
}
......@@ -972,33 +949,38 @@ runtime·gc(int32 force)
obj0 = mstats.nmalloc - mstats.nfree;
}
runtime·lock(&work.markgate);
runtime·lock(&work.sweepgate);
work.nwait = 0;
work.ndone = 0;
work.debugmarkdone = 0;
work.nproc = runtime·gcprocs();
if(work.sweepfor == nil)
work.sweepfor = runtime·parforalloc(MaxGcproc);
runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
if(work.nproc > 1) {
runtime·noteclear(&work.alldone);
runtime·helpgc(work.nproc);
}
work.nwait = 0;
work.ndone = 0;
runtime·unlock(&work.markgate); // let the helpers in
mark(scanblock);
if(DebugMark)
if(DebugMark) {
mark(debug_scanblock);
runtime·atomicstore(&work.debugmarkdone, 1);
}
t1 = runtime·nanotime();
work.spanidx = 0;
runtime·unlock(&work.sweepgate); // let the helpers in
sweep();
if(work.nproc > 1)
runtime·notesleep(&work.alldone);
runtime·parfordo(work.sweepfor);
t2 = runtime·nanotime();
stealcache();
cachestats(&stats);
if(work.nproc > 1)
runtime·notesleep(&work.alldone);
stats.nprocyield += work.sweepfor->nprocyield;
stats.nosyield += work.sweepfor->nosyield;
stats.nsleep += work.sweepfor->nsleep;
mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
m->gcing = 0;
......@@ -1027,20 +1009,21 @@ runtime·gc(int32 force)
if(gctrace) {
runtime·printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects,"
" %D(%D) handoff, %D/%D/%D yields\n",
" %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000,
heap0>>20, heap1>>20, obj0, obj1,
mstats.nmalloc, mstats.nfree,
stats.nhandoff, stats.nhandoffcnt,
work.sweepfor->nsteal, work.sweepfor->nstealcnt,
stats.nprocyield, stats.nosyield, stats.nsleep);
}
runtime·MProf_GC();
runtime·semrelease(&runtime·worldsema);
runtime·starttheworld();
// give the queued finalizers, if any, a chance to run
if(finq != nil)
// give the queued finalizers, if any, a chance to run
if(finq != nil)
runtime·gosched();
if(gctrace > 1 && !force)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment