Commit 3b246fa8 authored by Dmitry Vyukov's avatar Dmitry Vyukov

runtime: sleep less when we can do work

Usleep(100) in runqgrab negatively affects latency and throughput
of parallel application. We are sleeping instead of doing useful work.
This is effect is particularly visible on windows where minimal
sleep duration is 1-15ms.

Reduce sleep from 100us to 3us and use osyield on windows.
Sync chan send/recv takes ~50ns, so 3us gives us ~50x overshoot.

benchmark                    old ns/op     new ns/op     delta
BenchmarkChanSync-12         216           217           +0.46%
BenchmarkChanSyncWork-12     27213         25816         -5.13%

CPU consumption goes up from 106% to 108% in the first case,
and from 107% to 125% in the second case.

Test case from #14790 on windows:

BenchmarkDefaultResolution-8  4583372   29720    -99.35%
Benchmark1ms-8                992056    30701    -96.91%

99-th latency percentile for HTTP request serving is improved by up to 15%
(see http://golang.org/cl/20835 for details).

The following benchmarks are from the change that originally added this sleep
(see https://golang.org/s/go15gomaxprocs):

name        old time/op  new time/op  delta
Chain       22.6µs ± 2%  22.7µs ± 6%    ~      (p=0.905 n=9+10)
ChainBuf    22.4µs ± 3%  22.5µs ± 4%    ~      (p=0.780 n=9+10)
Chain-2     23.5µs ± 4%  24.9µs ± 1%  +5.66%   (p=0.000 n=10+9)
ChainBuf-2  23.7µs ± 1%  24.4µs ± 1%  +3.31%   (p=0.000 n=9+10)
Chain-4     24.2µs ± 2%  25.1µs ± 3%  +3.70%   (p=0.000 n=9+10)
ChainBuf-4  24.4µs ± 5%  25.0µs ± 2%  +2.37%  (p=0.023 n=10+10)
Powser       2.37s ± 1%   2.37s ± 1%    ~       (p=0.423 n=8+9)
Powser-2     2.48s ± 2%   2.57s ± 2%  +3.74%   (p=0.000 n=10+9)
Powser-4     2.66s ± 1%   2.75s ± 1%  +3.40%  (p=0.000 n=10+10)
Sieve        13.3s ± 2%   13.3s ± 2%    ~      (p=1.000 n=10+9)
Sieve-2      7.00s ± 2%   7.44s ±16%    ~      (p=0.408 n=8+10)
Sieve-4      4.13s ±21%   3.85s ±22%    ~       (p=0.113 n=9+9)

Fixes #14790

Change-Id: Ie7c6a1c4f9c8eb2f5d65ab127a3845386d6f8b5d
Reviewed-on: https://go-review.googlesource.com/20835Reviewed-by: default avatarAustin Clements <austin@google.com>
parent 036d09d5
...@@ -777,7 +777,7 @@ func BenchmarkChanContended(b *testing.B) { ...@@ -777,7 +777,7 @@ func BenchmarkChanContended(b *testing.B) {
}) })
} }
func BenchmarkChanSync(b *testing.B) { func benchmarkChanSync(b *testing.B, work int) {
const CallsPerSched = 1000 const CallsPerSched = 1000
procs := 2 procs := 2
N := int32(b.N / CallsPerSched / procs * procs) N := int32(b.N / CallsPerSched / procs * procs)
...@@ -793,10 +793,14 @@ func BenchmarkChanSync(b *testing.B) { ...@@ -793,10 +793,14 @@ func BenchmarkChanSync(b *testing.B) {
for g := 0; g < CallsPerSched; g++ { for g := 0; g < CallsPerSched; g++ {
if i%2 == 0 { if i%2 == 0 {
<-myc <-myc
localWork(work)
myc <- 0 myc <- 0
localWork(work)
} else { } else {
myc <- 0 myc <- 0
localWork(work)
<-myc <-myc
localWork(work)
} }
} }
} }
...@@ -808,6 +812,14 @@ func BenchmarkChanSync(b *testing.B) { ...@@ -808,6 +812,14 @@ func BenchmarkChanSync(b *testing.B) {
} }
} }
func BenchmarkChanSync(b *testing.B) {
benchmarkChanSync(b, 0)
}
func BenchmarkChanSyncWork(b *testing.B) {
benchmarkChanSync(b, 1000)
}
func benchmarkChanProdCons(b *testing.B, chanSize, localWork int) { func benchmarkChanProdCons(b *testing.B, chanSize, localWork int) {
const CallsPerSched = 1000 const CallsPerSched = 1000
procs := runtime.GOMAXPROCS(-1) procs := runtime.GOMAXPROCS(-1)
...@@ -981,3 +993,18 @@ func BenchmarkChanPopular(b *testing.B) { ...@@ -981,3 +993,18 @@ func BenchmarkChanPopular(b *testing.B) {
} }
wg.Wait() wg.Wait()
} }
var (
alwaysFalse = false
workSink = 0
)
func localWork(w int) {
foo := 0
for i := 0; i < w; i++ {
foo /= (foo + 1)
}
if alwaysFalse {
workSink += foo
}
}
...@@ -3999,7 +3999,16 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool ...@@ -3999,7 +3999,16 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool
// Instead of stealing runnext in this window, back off // Instead of stealing runnext in this window, back off
// to give _p_ a chance to schedule runnext. This will avoid // to give _p_ a chance to schedule runnext. This will avoid
// thrashing gs between different Ps. // thrashing gs between different Ps.
usleep(100) // A sync chan send/recv takes ~50ns as of time of writing,
// so 3us gives ~50x overshoot.
if GOOS != "windows" {
usleep(3)
} else {
// On windows system timer granularity is 1-15ms,
// which is way too much for this optimization.
// So just yield.
osyield()
}
if !_p_.runnext.cas(next, 0) { if !_p_.runnext.cas(next, 0) {
continue continue
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment