runtime: faster GC sweep phase

benchmark old ns/op new ns/op delta garbage.BenchmarkParser 3731065750 3715543750 -0.41% garbage.BenchmarkParser-2 3631299750 3495248500 -3.75% garbage.BenchmarkParser-4 3386486000 3339353000 -1.39% garbage.BenchmarkParser-8 3267632000 3286422500 +0.58% garbage.BenchmarkParser-16 3299203000 3316081750 +0.51% garbage.BenchmarkTree 977532888 919453833 -5.94% garbage.BenchmarkTree-2 919948555 853478000 -7.23% garbage.BenchmarkTree-4 841329000 790207000 -6.08% garbage.BenchmarkTree-8 787792777 740380666 -6.01% garbage.BenchmarkTree-16 899257166 846594555 -5.86% garbage.BenchmarkTree2 574876300 571885800 -0.52% garbage.BenchmarkTree2-2 348162700 345888900 -0.65% garbage.BenchmarkTree2-4 184912500 179137000 -3.22% garbage.BenchmarkTree2-8 104243900 103485600 -0.73% garbage.BenchmarkTree2-16 97269500 85137100 -14.25% garbage.BenchmarkParserPause 141101976 157746974 +11.80% garbage.BenchmarkParserPause-2 103096051 83043048 -19.45% garbage.BenchmarkParserPause-4 52153133 45951111 -11.89% garbage.BenchmarkParserPause-8 36730190 38901024 +5.91% garbage.BenchmarkParserPause-16 32678875 29578585 -9.49% garbage.BenchmarkTreePause 29487065 29648439 +0.55% garbage.BenchmarkTreePause-2 22443494 21306159 -5.07% garbage.BenchmarkTreePause-4 15799691 14985647 -5.15% garbage.BenchmarkTreePause-8 10768112 9531420 -12.97% garbage.BenchmarkTreePause-16 16329891 15205158 -6.89% garbage.BenchmarkTree2Pause 2586957240 2577533200 -0.36% garbage.BenchmarkTree2Pause-2 1683383760 1673923800 -0.56% garbage.BenchmarkTree2Pause-4 1102860320 1074040280 -2.68% garbage.BenchmarkTree2Pause-8 902627920 886122400 -1.86% garbage.BenchmarkTree2Pause-16 856470920 804152320 -6.50% garbage.BenchmarkParserLastPause 277316000 280839000 +1.25% garbage.BenchmarkParserLastPause-2 179446000 163687000 -8.78% garbage.BenchmarkParserLastPause-4 106752000 94144000 -11.81% garbage.BenchmarkParserLastPause-8 57758000 61640000 +6.72% garbage.BenchmarkParserLastPause-16 51235000 42552000 -16.95% garbage.BenchmarkTreeLastPause 45244000 50786000 +12.25% garbage.BenchmarkTreeLastPause-2 37163000 34654000 -6.75% garbage.BenchmarkTreeLastPause-4 24178000 21967000 -9.14% garbage.BenchmarkTreeLastPause-8 20390000 15648000 -30.30% garbage.BenchmarkTreeLastPause-16 22398000 20180000 -9.90% garbage.BenchmarkTree2LastPause 5748706000 5718809000 -0.52% garbage.BenchmarkTree2LastPause-2 3481570000 3458844000 -0.65% garbage.BenchmarkTree2LastPause-4 1849073000 1791330000 -3.22% garbage.BenchmarkTree2LastPause-8 1042375000 1034811000 -0.73% garbage.BenchmarkTree2LastPause-16 972637000 851323000 -14.25% There is also visible improvement in consumed CPU time: tree2 -heapsize=8000000000 -cpus=12 before: 248.74user 6.36system 0:52.74elapsed 483%CPU after: 229.86user 6.33system 0:51.08elapsed 462%CPU -1.66s of real time, but -18.91s of consumed CPU time R=golang-dev CC=golang-dev https://golang.org/cl/6215065

runtime: faster GC sweep phase
benchmark old ns/op new ns/op delta garbage.BenchmarkParser 3731065750 3715543750 -0.41% garbage.BenchmarkParser-2 3631299750 3495248500 -3.75% garbage.BenchmarkParser-4 3386486000 3339353000 -1.39% garbage.BenchmarkParser-8 3267632000 3286422500 +0.58% garbage.BenchmarkParser-16 3299203000 3316081750 +0.51% garbage.BenchmarkTree 977532888 919453833 -5.94% garbage.BenchmarkTree-2 919948555 853478000 -7.23% garbage.BenchmarkTree-4 841329000 790207000 -6.08% garbage.BenchmarkTree-8 787792777 740380666 -6.01% garbage.BenchmarkTree-16 899257166 846594555 -5.86% garbage.BenchmarkTree2 574876300 571885800 -0.52% garbage.BenchmarkTree2-2 348162700 345888900 -0.65% garbage.BenchmarkTree2-4 184912500 179137000 -3.22% garbage.BenchmarkTree2-8 104243900 103485600 -0.73% garbage.BenchmarkTree2-16 97269500 85137100 -14.25% garbage.BenchmarkParserPause 141101976 157746974 +11.80% garbage.BenchmarkParserPause-2 103096051 83043048 -19.45% garbage.BenchmarkParserPause-4 52153133 45951111 -11.89% garbage.BenchmarkParserPause-8 36730190 38901024 +5.91% garbage.BenchmarkParserPause-16 32678875 29578585 -9.49% garbage.BenchmarkTreePause 29487065 29648439 +0.55% garbage.BenchmarkTreePause-2 22443494 21306159 -5.07% garbage.BenchmarkTreePause-4 15799691 14985647 -5.15% garbage.BenchmarkTreePause-8 10768112 9531420 -12.97% garbage.BenchmarkTreePause-16 16329891 15205158 -6.89% garbage.BenchmarkTree2Pause 2586957240 2577533200 -0.36% garbage.BenchmarkTree2Pause-2 1683383760 1673923800 -0.56% garbage.BenchmarkTree2Pause-4 1102860320 1074040280 -2.68% garbage.BenchmarkTree2Pause-8 902627920 886122400 -1.86% garbage.BenchmarkTree2Pause-16 856470920 804152320 -6.50% garbage.BenchmarkParserLastPause 277316000 280839000 +1.25% garbage.BenchmarkParserLastPause-2 179446000 163687000 -8.78% garbage.BenchmarkParserLastPause-4 106752000 94144000 -11.81% garbage.BenchmarkParserLastPause-8 57758000 61640000 +6.72% garbage.BenchmarkParserLastPause-16 51235000 42552000 -16.95% garbage.BenchmarkTreeLastPause 45244000 50786000 +12.25% garbage.BenchmarkTreeLastPause-2 37163000 34654000 -6.75% garbage.BenchmarkTreeLastPause-4 24178000 21967000 -9.14% garbage.BenchmarkTreeLastPause-8 20390000 15648000 -30.30% garbage.BenchmarkTreeLastPause-16 22398000 20180000 -9.90% garbage.BenchmarkTree2LastPause 5748706000 5718809000 -0.52% garbage.BenchmarkTree2LastPause-2 3481570000 3458844000 -0.65% garbage.BenchmarkTree2LastPause-4 1849073000 1791330000 -3.22% garbage.BenchmarkTree2LastPause-8 1042375000 1034811000 -0.73% garbage.BenchmarkTree2LastPause-16 972637000 851323000 -14.25% There is also visible improvement in consumed CPU time: tree2 -heapsize=8000000000 -cpus=12 before: 248.74user 6.36system 0:52.74elapsed 483%CPU after: 229.86user 6.33system 0:51.08elapsed 462%CPU -1.66s of real time, but -18.91s of consumed CPU time R=golang-dev CC=golang-dev https://golang.org/cl/6215065
845aa1fc · Dmitriy Vyukov · Russ Cox · 581e7c2a · 845aa1fc
Commit 845aa1fc authored May 22, 2012 by Dmitriy Vyukov Committed by Russ Cox May 22, 2012
Hide whitespace changes
Inline Side-by-side

Showing with 40 additions and 57 deletions

src/pkg/runtime/mgc0.c src/pkg/runtime/mgc0.c +40 -57

No files found.
--- a/src/pkg/runtime/mgc0.c
+++ b/src/pkg/runtime/mgc0.c
@@ -120,10 +120,9 @@ static struct {
 	uint32	nproc;
 	volatile uint32	nwait;
 	volatile uint32	ndone;
+	volatile uint32 debugmarkdone;
 	Note	alldone;
-	Lock	markgate;
-	Lock	sweepgate;
-	uint32	spanidx;
+	ParFor	*sweepfor;

 	Lock;
 	byte	*chunk;
@@ -720,40 +719,10 @@ handlespecial(byte *p, uintptr size)
 	return true;
 }

-static void sweepspan(MSpan *s);
-
 // Sweep frees or collects finalizers for blocks not marked in the mark phase.
 // It clears the mark bits in preparation for the next GC round.
 static void
-sweep(void)
-{
-	MSpan *s, **allspans;
-	int64 now;
-	uint32 spanidx, nspan;
-
-	now = runtime·nanotime();
-	nspan = runtime·mheap.nspan;
-	allspans = runtime·mheap.allspans;
-	for(;;) {
-		spanidx = runtime·xadd(&work.spanidx, 1) - 1;
-		if(spanidx >= nspan)
-			break;
-		s = allspans[spanidx];
-
-		// Stamp newly unused spans. The scavenger will use that
-		// info to potentially give back some pages to the OS.
-		if(s->state == MSpanFree && s->unusedsince == 0)
-			s->unusedsince = now;
-
-		if(s->state != MSpanInUse)
-			continue;
-
-		sweepspan(s);
-	}
-}
-
-static void
-sweepspan(MSpan *s)
+sweepspan(ParFor *desc, uint32 idx)
 {
 	int32 cl, n, npages;
 	uintptr size;
@@ -762,7 +731,16 @@ sweepspan(MSpan *s)
 	byte *arena_start;
 	MLink *start, *end;
 	int32 nfree;
+	MSpan *s;

+	USED(&desc);
+	s = runtime·mheap.allspans[idx];
+	// Stamp newly unused spans. The scavenger will use that
+	// info to potentially give back some pages to the OS.
+	if(s->state == MSpanFree && s->unusedsince == 0)
+		s->unusedsince = runtime·nanotime();
+	if(s->state != MSpanInUse)
+		return;
 	arena_start = runtime·mheap.arena_start;
 	p = (byte*)(s->start << PageShift);
 	cl = s->sizeclass;
@@ -847,16 +825,15 @@ sweepspan(MSpan *s)
 void
 runtime·gchelper(void)
 {
-	// Wait until main proc is ready for mark help.
-	runtime·lock(&work.markgate);
-	runtime·unlock(&work.markgate);
 	scanblock(nil, 0);

-	// Wait until main proc is ready for sweep help.
-	runtime·lock(&work.sweepgate);
-	runtime·unlock(&work.sweepgate);
-	sweep();
+	if(DebugMark) {
+		// wait while the main thread executes mark(debug_scanblock)
+		while(runtime·atomicload(&work.debugmarkdone) == 0)
+			runtime·usleep(10);
+	}

+	runtime·parfordo(work.sweepfor);
 	if(runtime·xadd(&work.ndone, +1) == work.nproc-1)
 		runtime·notewakeup(&work.alldone);
 }
@@ -972,33 +949,38 @@ runtime·gc(int32 force)
 		obj0 = mstats.nmalloc - mstats.nfree;
 	}

-	runtime·lock(&work.markgate);
-	runtime·lock(&work.sweepgate);
-
+	work.nwait = 0;
+	work.ndone = 0;
+	work.debugmarkdone = 0;
 	work.nproc = runtime·gcprocs();
+	if(work.sweepfor == nil)
+		work.sweepfor = runtime·parforalloc(MaxGcproc);
+	runtime·parforsetup(work.sweepfor, work.nproc, runtime·mheap.nspan, nil, true, sweepspan);
 	if(work.nproc > 1) {
 		runtime·noteclear(&work.alldone);
 		runtime·helpgc(work.nproc);
 	}
-	work.nwait = 0;
-	work.ndone = 0;

-	runtime·unlock(&work.markgate);  // let the helpers in
 	mark(scanblock);
-	if(DebugMark)
+	if(DebugMark) {
 		mark(debug_scanblock);
+		runtime·atomicstore(&work.debugmarkdone, 1);
+	}
 	t1 = runtime·nanotime();

-	work.spanidx = 0;
-	runtime·unlock(&work.sweepgate);  // let the helpers in
-	sweep();
-	if(work.nproc > 1)
-		runtime·notesleep(&work.alldone);
+	runtime·parfordo(work.sweepfor);
 	t2 = runtime·nanotime();

 	stealcache();
 	cachestats(&stats);

+	if(work.nproc > 1)
+		runtime·notesleep(&work.alldone);
+
+	stats.nprocyield += work.sweepfor->nprocyield;
+	stats.nosyield += work.sweepfor->nosyield;
+	stats.nsleep += work.sweepfor->nsleep;
+
 	mstats.next_gc = mstats.heap_alloc+mstats.heap_alloc*gcpercent/100;
 	m->gcing = 0;

@@ -1027,20 +1009,21 @@ runtime·gc(int32 force)

 	if(gctrace) {
 		runtime·printf("gc%d(%d): %D+%D+%D ms, %D -> %D MB %D -> %D (%D-%D) objects,"
-				" %D(%D) handoff, %D/%D/%D yields\n",
+				" %D(%D) handoff, %D(%D) steal, %D/%D/%D yields\n",
 			mstats.numgc, work.nproc, (t1-t0)/1000000, (t2-t1)/1000000, (t3-t2)/1000000,
 			heap0>>20, heap1>>20, obj0, obj1,
 			mstats.nmalloc, mstats.nfree,
 			stats.nhandoff, stats.nhandoffcnt,
+			work.sweepfor->nsteal, work.sweepfor->nstealcnt,
 			stats.nprocyield, stats.nosyield, stats.nsleep);
 	}
-	
+
 	runtime·MProf_GC();
 	runtime·semrelease(&runtime·worldsema);
 	runtime·starttheworld();

-	// give the queued finalizers, if any, a chance to run	
-	if(finq != nil)	
+	// give the queued finalizers, if any, a chance to run
+	if(finq != nil)
 		runtime·gosched();

 	if(gctrace > 1 && !force)