Commit beedb1ec authored by Austin Clements's avatar Austin Clements

runtime: add pcvalue cache to improve stack scan speed

The cost of scanning large stacks is currently dominated by the time
spent looking up and decoding the pcvalue table. However, large stacks
are usually large not because they contain calls to many different
functions, but because they contain many calls to the same, small set
of recursive functions. Hence, walking large stacks tends to make the
same pcvalue queries many times.

Based on this observation, this commit adds a small, very simple, and
fast cache in front of pcvalue lookup. We thread this cache down from
operations that make many pcvalue calls, such as gentraceback, stack
scanning, and stack adjusting.

This simple cache works well because it has minimal overhead when it's
not effective. I also tried a hashed direct-map cache, CLOCK-based
replacement, round-robin replacement, and round-robin with lookups
disabled until there had been at least 16 probes, but none of these
approaches had obvious wins over the random replacement policy in this
commit.

This nearly doubles the overall performance of the deep stack test
program from issue #10898:

name        old time/op  new time/op  delta
Issue10898   16.5s ±12%    9.2s ±12%  -44.37%  (p=0.008 n=5+5)

It's a very slight win on the garbage benchmark:

name              old time/op  new time/op  delta
XBenchGarbage-12  4.92ms ± 1%  4.89ms ± 1%  -0.75%  (p=0.000 n=18+19)

It's a wash (but doesn't harm performance) on the go1 benchmarks,
which don't have particularly deep stacks:

name                      old time/op    new time/op    delta
BinaryTree17-12              3.11s ± 2%     3.20s ± 3%  +2.83%  (p=0.000 n=17+20)
Fannkuch11-12                2.51s ± 1%     2.51s ± 1%  -0.22%  (p=0.034 n=19+18)
FmtFprintfEmpty-12          50.8ns ± 3%    50.6ns ± 2%    ~     (p=0.793 n=20+20)
FmtFprintfString-12          174ns ± 0%     174ns ± 1%  +0.17%  (p=0.048 n=15+20)
FmtFprintfInt-12             177ns ± 0%     165ns ± 1%  -6.99%  (p=0.000 n=17+19)
FmtFprintfIntInt-12          283ns ± 1%     284ns ± 0%  +0.22%  (p=0.000 n=18+15)
FmtFprintfPrefixedInt-12     243ns ± 1%     244ns ± 1%  +0.40%  (p=0.000 n=20+19)
FmtFprintfFloat-12           318ns ± 0%     319ns ± 0%  +0.27%  (p=0.001 n=19+20)
FmtManyArgs-12              1.12µs ± 0%    1.14µs ± 0%  +1.74%  (p=0.000 n=19+20)
GobDecode-12                8.69ms ± 0%    8.73ms ± 1%  +0.46%  (p=0.000 n=18+18)
GobEncode-12                6.64ms ± 1%    6.61ms ± 1%  -0.46%  (p=0.000 n=20+20)
Gzip-12                      323ms ± 2%     319ms ± 1%  -1.11%  (p=0.000 n=20+20)
Gunzip-12                   42.8ms ± 0%    42.9ms ± 0%    ~     (p=0.158 n=18+20)
HTTPClientServer-12         63.3µs ± 1%    63.1µs ± 1%  -0.35%  (p=0.011 n=20+20)
JSONEncode-12               16.9ms ± 1%    17.3ms ± 1%  +2.84%  (p=0.000 n=19+20)
JSONDecode-12               59.7ms ± 0%    58.5ms ± 0%  -2.05%  (p=0.000 n=19+17)
Mandelbrot200-12            3.92ms ± 0%    3.91ms ± 0%  -0.16%  (p=0.003 n=19+19)
GoParse-12                  3.79ms ± 2%    3.75ms ± 2%  -0.91%  (p=0.005 n=20+20)
RegexpMatchEasy0_32-12       102ns ± 1%     101ns ± 1%  -0.80%  (p=0.001 n=14+20)
RegexpMatchEasy0_1K-12       337ns ± 1%     346ns ± 1%  +2.90%  (p=0.000 n=20+19)
RegexpMatchEasy1_32-12      84.4ns ± 2%    84.3ns ± 2%    ~     (p=0.743 n=20+20)
RegexpMatchEasy1_1K-12       502ns ± 1%     505ns ± 0%  +0.64%  (p=0.000 n=20+20)
RegexpMatchMedium_32-12      133ns ± 1%     132ns ± 1%  -0.85%  (p=0.000 n=20+19)
RegexpMatchMedium_1K-12     40.1µs ± 1%    39.8µs ± 1%  -0.77%  (p=0.000 n=18+18)
RegexpMatchHard_32-12       2.08µs ± 1%    2.07µs ± 1%  -0.55%  (p=0.001 n=18+19)
RegexpMatchHard_1K-12       62.4µs ± 1%    62.0µs ± 1%  -0.74%  (p=0.000 n=19+19)
Revcomp-12                   545ms ± 2%     545ms ± 3%    ~     (p=0.771 n=19+20)
Template-12                 73.7ms ± 1%    72.0ms ± 0%  -2.33%  (p=0.000 n=20+18)
TimeParse-12                 358ns ± 1%     351ns ± 1%  -2.07%  (p=0.000 n=20+20)
TimeFormat-12                369ns ± 1%     356ns ± 0%  -3.53%  (p=0.000 n=20+18)
[Geo mean]                  63.5µs         63.2µs       -0.41%

name                      old speed      new speed      delta
GobDecode-12              88.3MB/s ± 0%  87.9MB/s ± 0%  -0.43%  (p=0.000 n=18+17)
GobEncode-12               116MB/s ± 1%   116MB/s ± 1%  +0.47%  (p=0.000 n=20+20)
Gzip-12                   60.2MB/s ± 2%  60.8MB/s ± 1%  +1.13%  (p=0.000 n=20+20)
Gunzip-12                  453MB/s ± 0%   453MB/s ± 0%    ~     (p=0.160 n=18+20)
JSONEncode-12              115MB/s ± 1%   112MB/s ± 1%  -2.76%  (p=0.000 n=19+20)
JSONDecode-12             32.5MB/s ± 0%  33.2MB/s ± 0%  +2.09%  (p=0.000 n=19+17)
GoParse-12                15.3MB/s ± 2%  15.4MB/s ± 2%  +0.92%  (p=0.004 n=20+20)
RegexpMatchEasy0_32-12     311MB/s ± 1%   314MB/s ± 1%  +0.78%  (p=0.000 n=15+19)
RegexpMatchEasy0_1K-12    3.04GB/s ± 1%  2.95GB/s ± 1%  -2.90%  (p=0.000 n=19+19)
RegexpMatchEasy1_32-12     379MB/s ± 2%   380MB/s ± 2%    ~     (p=0.779 n=20+20)
RegexpMatchEasy1_1K-12    2.04GB/s ± 1%  2.02GB/s ± 0%  -0.62%  (p=0.000 n=20+20)
RegexpMatchMedium_32-12   7.46MB/s ± 1%  7.53MB/s ± 1%  +0.86%  (p=0.000 n=20+19)
RegexpMatchMedium_1K-12   25.5MB/s ± 1%  25.7MB/s ± 1%  +0.78%  (p=0.000 n=18+18)
RegexpMatchHard_32-12     15.4MB/s ± 1%  15.5MB/s ± 1%  +0.62%  (p=0.000 n=19+19)
RegexpMatchHard_1K-12     16.4MB/s ± 1%  16.5MB/s ± 1%  +0.82%  (p=0.000 n=20+19)
Revcomp-12                 466MB/s ± 2%   466MB/s ± 3%    ~     (p=0.765 n=19+20)
Template-12               26.3MB/s ± 1%  27.0MB/s ± 0%  +2.38%  (p=0.000 n=20+18)
[Geo mean]                97.8MB/s       98.0MB/s       +0.23%

Change-Id: I281044ae0b24990ba46487cacbc1069493274bc4
Reviewed-on: https://go-review.googlesource.com/13614Reviewed-by: default avatarKeith Randall <khr@golang.org>
parent 46630ecd
......@@ -247,7 +247,7 @@ func dumpframe(s *stkframe, arg unsafe.Pointer) bool {
if pc != f.entry {
pc--
}
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, pc)
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, pc, nil)
if pcdata == -1 {
// We do not have a valid pcdata value but there might be a
// stackmap for this function. It is likely that we are looking
......
......@@ -1696,7 +1696,7 @@ func getgcmask(ep interface{}) (mask []byte) {
if targetpc != f.entry {
targetpc--
}
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, nil)
if pcdata == -1 {
return
}
......
......@@ -438,10 +438,11 @@ func scanstack(gp *g) {
throw("scanstack in wrong phase")
}
var cache pcvalueCache
gcw := &getg().m.p.ptr().gcw
n := 0
scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
scanframeworker(frame, unused, gcw)
scanframeworker(frame, &cache, gcw)
if frame.fp > nextBarrier {
// We skip installing a barrier on bottom-most
......@@ -474,7 +475,7 @@ func scanstack(gp *g) {
// Scan a stack frame: local variables and function arguments/results.
//go:nowritebarrier
func scanframeworker(frame *stkframe, unused unsafe.Pointer, gcw *gcWork) {
func scanframeworker(frame *stkframe, cache *pcvalueCache, gcw *gcWork) {
f := frame.fn
targetpc := frame.continpc
......@@ -488,7 +489,7 @@ func scanframeworker(frame *stkframe, unused unsafe.Pointer, gcw *gcWork) {
if targetpc != f.entry {
targetpc--
}
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, cache)
if pcdata == -1 {
// We do not have a valid pcdata value but there might be a
// stackmap for this function. It is likely that we are looking
......
......@@ -482,6 +482,7 @@ var ptrnames = []string{
type adjustinfo struct {
old stack
delta uintptr // ptr distance from old to new stack (newbase - oldbase)
cache pcvalueCache
}
// Adjustpointer checks whether *vpp is in the old stack described by adjinfo.
......@@ -575,7 +576,7 @@ func adjustframe(frame *stkframe, arg unsafe.Pointer) bool {
if targetpc != f.entry {
targetpc--
}
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, &adjinfo.cache)
if pcdata == -1 {
pcdata = 0 // in prologue
}
......
......@@ -112,6 +112,7 @@ func moduledataverify1(datap *moduledata) {
// ftab is lookup table for function by program counter.
nftab := len(datap.ftab) - 1
var pcCache pcvalueCache
for i := 0; i < nftab; i++ {
// NOTE: ftab[nftab].entry is legal; it is the address beyond the final function.
if datap.ftab[i].entry > datap.ftab[i+1].entry {
......@@ -147,9 +148,9 @@ func moduledataverify1(datap *moduledata) {
}
}
}
pcvalue(f, f.pcfile, end, true)
pcvalue(f, f.pcln, end, true)
pcvalue(f, f.pcsp, end, true)
pcvalue(f, f.pcfile, end, &pcCache, true)
pcvalue(f, f.pcln, end, &pcCache, true)
pcvalue(f, f.pcsp, end, &pcCache, true)
}
}
......@@ -226,10 +227,42 @@ func findfunc(pc uintptr) *_func {
return (*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[idx].funcoff]))
}
func pcvalue(f *_func, off int32, targetpc uintptr, strict bool) int32 {
type pcvalueCache struct {
entries [16]pcvalueCacheEnt
}
type pcvalueCacheEnt struct {
// targetpc and off together are the key of this cache entry.
targetpc uintptr
off int32
// val is the value of this cached pcvalue entry.
val int32
}
func pcvalue(f *_func, off int32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 {
if off == 0 {
return -1
}
// Check the cache. This speeds up walks of deep stacks, which
// tend to have the same recursive functions over and over.
//
// This cache is small enough that full associativity is
// cheaper than doing the hashing for a less associative
// cache.
if cache != nil {
for _, ent := range cache.entries {
// We check off first because we're more
// likely to have multiple entries with
// different offsets for the same targetpc
// than the other way around, so we'll usually
// fail in the first clause.
if ent.off == off && ent.targetpc == targetpc {
return ent.val
}
}
}
datap := findmoduledatap(f.entry) // inefficient
if datap == nil {
if strict && panicking == 0 {
......@@ -248,6 +281,19 @@ func pcvalue(f *_func, off int32, targetpc uintptr, strict bool) int32 {
break
}
if targetpc < pc {
// Replace a random entry in the cache. Random
// replacement prevents a performance cliff if
// a recursive stack's cycle is slightly
// larger than the cache.
if cache != nil {
ci := fastrand1() % uint32(len(cache.entries))
cache.entries[ci] = pcvalueCacheEnt{
targetpc: targetpc,
off: off,
val: val,
}
}
return val
}
}
......@@ -296,8 +342,8 @@ func funcline1(f *_func, targetpc uintptr, strict bool) (file string, line int32
if datap == nil {
return "?", 0
}
fileno := int(pcvalue(f, f.pcfile, targetpc, strict))
line = pcvalue(f, f.pcln, targetpc, strict)
fileno := int(pcvalue(f, f.pcfile, targetpc, nil, strict))
line = pcvalue(f, f.pcln, targetpc, nil, strict)
if fileno == -1 || line == -1 || fileno >= len(datap.filetab) {
// print("looking for ", hex(targetpc), " in ", funcname(f), " got file=", fileno, " line=", lineno, "\n")
return "?", 0
......@@ -310,20 +356,20 @@ func funcline(f *_func, targetpc uintptr) (file string, line int32) {
return funcline1(f, targetpc, true)
}
func funcspdelta(f *_func, targetpc uintptr) int32 {
x := pcvalue(f, f.pcsp, targetpc, true)
func funcspdelta(f *_func, targetpc uintptr, cache *pcvalueCache) int32 {
x := pcvalue(f, f.pcsp, targetpc, cache, true)
if x&(ptrSize-1) != 0 {
print("invalid spdelta ", funcname(f), " ", hex(f.entry), " ", hex(targetpc), " ", hex(f.pcsp), " ", x, "\n")
}
return x
}
func pcdatavalue(f *_func, table int32, targetpc uintptr) int32 {
func pcdatavalue(f *_func, table int32, targetpc uintptr, cache *pcvalueCache) int32 {
if table < 0 || table >= f.npcdata {
return -1
}
off := *(*int32)(add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(table)*4))
return pcvalue(f, off, targetpc, true)
return pcvalue(f, off, targetpc, cache, true)
}
func funcdata(f *_func, i int32) unsafe.Pointer {
......
......@@ -198,6 +198,8 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
}
frame.fn = f
var cache pcvalueCache
n := 0
for n < max {
// Typically:
......@@ -219,7 +221,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
sp = gp.m.curg.sched.sp
stkbar = gp.m.curg.stkbar[gp.m.curg.stkbarPos:]
}
frame.fp = sp + uintptr(funcspdelta(f, frame.pc))
frame.fp = sp + uintptr(funcspdelta(f, frame.pc, &cache))
if !usesLR {
// On x86, call instruction pushes return PC before entering new function.
frame.fp += regSize
......@@ -403,7 +405,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
frame.fn = f
if f == nil {
frame.pc = x
} else if funcspdelta(f, frame.pc) == 0 {
} else if funcspdelta(f, frame.pc, &cache) == 0 {
frame.lr = x
}
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment