Commit 025abd53 authored by Russ Cox's avatar Russ Cox

runtime: faster entersyscall, exitsyscall

Uses atomic memory accesses to avoid the need to acquire
and release schedlock on fast paths.

benchmark                            old ns/op    new ns/op    delta
runtime_test.BenchmarkSyscall               73           31  -56.63%
runtime_test.BenchmarkSyscall-2            538           74  -86.23%
runtime_test.BenchmarkSyscall-3            508          103  -79.72%
runtime_test.BenchmarkSyscall-4            721           97  -86.52%
runtime_test.BenchmarkSyscallWork          920          873   -5.11%
runtime_test.BenchmarkSyscallWork-2        516          481   -6.78%
runtime_test.BenchmarkSyscallWork-3        550          343  -37.64%
runtime_test.BenchmarkSyscallWork-4        632          263  -58.39%

(Intel Core i7 L640 2.13 GHz-based Lenovo X201s)

Reduced a less artificial server benchmark
from 11.5r 12.0u 8.0s to 8.3r 9.1u 1.0s.

R=dvyukov, r, bradfitz, r, iant, iant
CC=golang-dev
https://golang.org/cl/4723042
parent 9f636598
...@@ -15,3 +15,9 @@ var F32to64 = f32to64 ...@@ -15,3 +15,9 @@ var F32to64 = f32to64
var Fcmp64 = fcmp64 var Fcmp64 = fcmp64
var Fintto64 = fintto64 var Fintto64 = fintto64
var F64toint = f64toint var F64toint = f64toint
func entersyscall()
func exitsyscall()
var Entersyscall = entersyscall
var Exitsyscall = exitsyscall
This diff is collapsed.
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
model for proc.c as of 2011/07/15.
takes 4300 seconds to explore 1128130 states
with G=3, var_gomaxprocs=1
on a Core i7 L640 2.13 GHz Lenovo X201s.
rm -f proc.p.trail pan.* pan
spin -a proc.p
gcc -DSAFETY -DREACH -DMEMLIM'='4000 -o pan pan.c
pan -w28 -n -i -m500000
test -f proc.p.trail && pan -r proc.p.trail
*/
/*
* scheduling parameters
*/
/*
* the number of goroutines G doubles as the maximum
* number of OS threads; the max is reachable when all
* the goroutines are blocked in system calls.
*/
#define G 3
/*
* whether to allow gomaxprocs to vary during execution.
* enabling this checks the scheduler even when code is
* calling GOMAXPROCS, but it also slows down the verification
* by about 10x.
*/
#define var_gomaxprocs 1 /* allow gomaxprocs to vary */
/* gomaxprocs */
#if var_gomaxprocs
byte gomaxprocs = 3;
#else
#define gomaxprocs 3
#endif
/* queue of waiting M's: sched_mhead[:mwait] */
byte mwait;
byte sched_mhead[G];
/* garbage collector state */
bit gc_lock, gcwaiting;
/* goroutines sleeping, waiting to run */
byte gsleep, gwait;
/* scheduler state */
bit sched_lock;
bit sched_stopped;
bit atomic_gwaiting, atomic_waitstop;
byte atomic_mcpu, atomic_mcpumax;
/* M struct fields - state for handing off g to m. */
bit m_waitnextg[G];
bit m_havenextg[G];
bit m_nextg[G];
/*
* opt_atomic/opt_dstep mark atomic/deterministics
* sequences that are marked only for reasons of
* optimization, not for correctness of the algorithms.
*
* in general any code that runs while holding the
* schedlock and does not refer to or modify the atomic_*
* fields can be marked atomic/dstep without affecting
* the usefulness of the model. since we trust the lock
* implementation, what we really want to test is the
* interleaving of the atomic fast paths with entersyscall
* and exitsyscall.
*/
#define opt_atomic atomic
#define opt_dstep d_step
/* locks */
inline lock(x) {
d_step { x == 0; x = 1 }
}
inline unlock(x) {
d_step { assert x == 1; x = 0 }
}
/* notes */
inline noteclear(x) {
x = 0
}
inline notesleep(x) {
x == 1
}
inline notewakeup(x) {
opt_dstep { assert x == 0; x = 1 }
}
/*
* scheduler
*/
inline schedlock() {
lock(sched_lock)
}
inline schedunlock() {
unlock(sched_lock)
}
/*
* canaddmcpu is like the C function but takes
* an extra argument to include in the test, to model
* "cannget() && canaddmcpu()" as "canaddmcpu(cangget())"
*/
inline canaddmcpu(g) {
d_step {
g && atomic_mcpu < atomic_mcpumax;
atomic_mcpu++;
}
}
/*
* gput is like the C function.
* instead of tracking goroutines explicitly we
* maintain only the count of the number of
* waiting goroutines.
*/
inline gput() {
/* omitted: lockedm, idlem concerns */
opt_dstep {
gwait++;
if
:: gwait == 1 ->
atomic_gwaiting = 1
:: else
fi
}
}
/*
* cangget is a macro so it can be passed to
* canaddmcpu (see above).
*/
#define cangget() (gwait>0)
/*
* gget is like the C function.
*/
inline gget() {
opt_dstep {
assert gwait > 0;
gwait--;
if
:: gwait == 0 ->
atomic_gwaiting = 0
:: else
fi
}
}
/*
* mput is like the C function.
* here we do keep an explicit list of waiting M's,
* so that we know which ones can be awakened.
* we use _pid-1 because the monitor is proc 0.
*/
inline mput() {
opt_dstep {
sched_mhead[mwait] = _pid - 1;
mwait++
}
}
/*
* mnextg is like the C function mnextg(m, g).
* it passes an unspecified goroutine to m to start running.
*/
inline mnextg(m) {
opt_dstep {
m_nextg[m] = 1;
if
:: m_waitnextg[m] ->
m_waitnextg[m] = 0;
notewakeup(m_havenextg[m])
:: else
fi
}
}
/*
* mgetnextg handles the main m handoff in matchmg.
* it is like mget() || new M followed by mnextg(m, g),
* but combined to avoid a local variable.
* unlike the C code, a new M simply assumes it is
* running a g instead of using the mnextg coordination
* to obtain one.
*/
inline mgetnextg() {
opt_atomic {
if
:: mwait > 0 ->
mwait--;
mnextg(sched_mhead[mwait]);
sched_mhead[mwait] = 0;
:: else ->
run mstart();
fi
}
}
/*
* nextgandunlock is like the C function.
* it pulls a g off the queue or else waits for one.
*/
inline nextgandunlock() {
assert atomic_mcpu <= G;
if
:: m_nextg[_pid-1] ->
m_nextg[_pid-1] = 0;
schedunlock();
:: canaddmcpu(!m_nextg[_pid-1] && cangget()) ->
gget();
schedunlock();
:: else ->
opt_dstep {
mput();
m_nextg[_pid-1] = 0;
m_waitnextg[_pid-1] = 1;
noteclear(m_havenextg[_pid-1]);
}
if
:: atomic_waitstop && atomic_mcpu <= atomic_mcpumax ->
atomic_waitstop = 0;
notewakeup(sched_stopped)
:: else
fi;
schedunlock();
opt_dstep {
notesleep(m_havenextg[_pid-1]);
assert m_nextg[_pid-1];
m_nextg[_pid-1] = 0;
}
fi
}
/*
* stoptheworld is like the C function.
*/
inline stoptheworld() {
schedlock();
gcwaiting = 1;
atomic_mcpumax = 1;
do
:: d_step { atomic_mcpu > 1 ->
noteclear(sched_stopped);
assert !atomic_waitstop;
atomic_waitstop = 1 }
schedunlock();
notesleep(sched_stopped);
schedlock();
:: else ->
break
od;
schedunlock();
}
/*
* starttheworld is like the C function.
*/
inline starttheworld() {
schedlock();
gcwaiting = 0;
atomic_mcpumax = gomaxprocs;
matchmg();
schedunlock();
}
/*
* matchmg is like the C function.
*/
inline matchmg() {
do
:: canaddmcpu(cangget()) ->
gget();
mgetnextg();
:: else -> break
od
}
/*
* ready is like the C function.
* it puts a g on the run queue.
*/
inline ready() {
schedlock();
gput()
matchmg()
schedunlock()
}
/*
* schedule simulates the C scheduler.
* it assumes that there is always a goroutine
* running already, and the goroutine has entered
* the scheduler for an unspecified reason,
* either to yield or to block.
*/
inline schedule() {
schedlock();
mustsched = 0;
atomic_mcpu--;
assert atomic_mcpu <= G;
if
:: skip ->
// goroutine yields, still runnable
gput();
:: gsleep+1 < G ->
// goroutine goes to sleep (but there is another that can wake it)
gsleep++
fi;
// Find goroutine to run.
nextgandunlock()
}
/*
* entersyscall is like the C function.
*/
inline entersyscall() {
/*
* Fast path. Check all the conditions tested during schedlock/schedunlock
* below, and if we can get through the whole thing without stopping, run it
* in one atomic cas-based step.
*/
atomic {
if
:: atomic_gwaiting ->
skip
:: atomic_waitstop && atomic_mcpu-1 <= atomic_mcpumax ->
skip
:: else ->
atomic_mcpu--;
goto Lreturn_entersyscall;
fi
}
/*
* Normal path.
*/
schedlock()
d_step {
atomic_mcpu--;
}
if
:: atomic_gwaiting ->
matchmg()
:: else
fi;
if
:: atomic_waitstop && atomic_mcpu <= atomic_mcpumax ->
atomic_waitstop = 0;
notewakeup(sched_stopped)
:: else
fi;
schedunlock();
Lreturn_entersyscall:
skip
}
/*
* exitsyscall is like the C function.
*/
inline exitsyscall() {
/*
* Fast path. If there's a cpu available, use it.
*/
atomic {
// omitted profilehz check
if
:: atomic_mcpu >= atomic_mcpumax ->
skip
:: else ->
atomic_mcpu++;
goto Lreturn_exitsyscall
fi
}
/*
* Normal path.
*/
schedlock();
d_step {
atomic_mcpu++;
if
:: atomic_mcpu <= atomic_mcpumax ->
skip
:: else ->
mustsched = 1
fi
}
schedunlock()
Lreturn_exitsyscall:
skip
}
#if var_gomaxprocs
inline gomaxprocsfunc() {
schedlock();
opt_atomic {
if
:: gomaxprocs != 1 -> gomaxprocs = 1
:: gomaxprocs != 2 -> gomaxprocs = 2
:: gomaxprocs != 3 -> gomaxprocs = 3
fi;
}
if
:: gcwaiting != 0 ->
assert atomic_mcpumax == 1
:: else ->
atomic_mcpumax = gomaxprocs;
if
:: atomic_mcpu > gomaxprocs ->
mustsched = 1
:: else ->
matchmg()
fi
fi;
schedunlock();
}
#endif
/*
* mstart is the entry point for a new M.
* our model of an M is always running some
* unspecified goroutine.
*/
proctype mstart() {
/*
* mustsched is true if the goroutine must enter the
* scheduler instead of continuing to execute.
*/
bit mustsched;
do
:: skip ->
// goroutine reschedules.
schedule()
:: !mustsched ->
// goroutine does something.
if
:: skip ->
// goroutine executes system call
entersyscall();
exitsyscall()
:: atomic { gsleep > 0; gsleep-- } ->
// goroutine wakes another goroutine
ready()
:: lock(gc_lock) ->
// goroutine runs a garbage collection
stoptheworld();
starttheworld();
unlock(gc_lock)
#if var_gomaxprocs
:: skip ->
// goroutine picks a new gomaxprocs
gomaxprocsfunc()
#endif
fi
od;
assert 0;
}
/*
* monitor initializes the scheduler state
* and then watches for impossible conditions.
*/
active proctype monitor() {
opt_dstep {
byte i = 1;
do
:: i < G ->
gput();
i++
:: else -> break
od;
atomic_mcpu = 1;
atomic_mcpumax = 1;
}
run mstart();
do
// Should never have goroutines waiting with procs available.
:: !sched_lock && gwait > 0 && atomic_mcpu < atomic_mcpumax ->
assert 0
// Should never have gc waiting for stop if things have already stopped.
:: !sched_lock && atomic_waitstop && atomic_mcpu <= atomic_mcpumax ->
assert 0
od
}
...@@ -73,3 +73,53 @@ func BenchmarkStackGrowth(b *testing.B) { ...@@ -73,3 +73,53 @@ func BenchmarkStackGrowth(b *testing.B) {
<-c <-c
} }
} }
func BenchmarkSyscall(b *testing.B) {
const CallsPerSched = 1000
procs := runtime.GOMAXPROCS(-1)
N := int32(b.N / CallsPerSched)
c := make(chan bool, procs)
for p := 0; p < procs; p++ {
go func() {
for atomic.AddInt32(&N, -1) >= 0 {
runtime.Gosched()
for g := 0; g < CallsPerSched; g++ {
runtime.Entersyscall()
runtime.Exitsyscall()
}
}
c <- true
}()
}
for p := 0; p < procs; p++ {
<-c
}
}
func BenchmarkSyscallWork(b *testing.B) {
const CallsPerSched = 1000
const LocalWork = 100
procs := runtime.GOMAXPROCS(-1)
N := int32(b.N / CallsPerSched)
c := make(chan bool, procs)
for p := 0; p < procs; p++ {
go func() {
foo := 42
for atomic.AddInt32(&N, -1) >= 0 {
runtime.Gosched()
for g := 0; g < CallsPerSched; g++ {
runtime.Entersyscall()
for i := 0; i < LocalWork; i++ {
foo *= 2
foo /= 2
}
runtime.Exitsyscall()
}
}
c <- foo == 42
}()
}
for p := 0; p < procs; p++ {
<-c
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment