Commit 8dda1c4c authored by Rick Hudson

[dev.garbage] runtime: remove heapBitsSweepSpan

Prior to this CL the sweep phase was responsible for locating
all objects that were about to be freed and calling a function
to process each one. This was done by the function
heapBitsSweepSpan. Part of that processing included calls to
tracefree and msanfree, as well as counting how many objects
were freed.

The calls to tracefree and msanfree have been moved into the
mallocgc routine and are made when the object is about to be
reallocated. The counting of free objects has been optimized
using an array-based popcount algorithm, and if all the objects
in a span are free then the span itself is freed.

Similarly, the code that locates the next free object has been
optimized to use an array-based ctz (count trailing zeros).
Various hot paths in the allocation logic have been optimized.

At this point the garbage benchmark is within 3% of the 1.6
release.

Change-Id: I00643c442e2ada1685c010c3447e4ea8537d2dfa
Reviewed-on: https://go-review.googlesource.com/20201
Reviewed-by: Austin Clements <austin@google.com>
parent 40934815
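
The two optimizations described above can be sketched outside the runtime. The standalone program below is not part of this CL; it is a simplified illustration that uses math/bits (a standard-library package added in a later Go release) where the CL itself uses the ctzVals and oneBitCount lookup tables shown in the diff, and every name in it is hypothetical.

package main

import (
	"fmt"
	"math/bits"
)

// nextFree returns the index of the next free object given a 64-bit cache of
// free bits (1 = free) whose bit 0 corresponds to the object at freeindex.
// It returns nelems when no cached object is free.
func nextFree(cache uint64, freeindex, nelems uint) uint {
	if cache == 0 {
		return nelems // the caller would refill the cache or give up
	}
	idx := freeindex + uint(bits.TrailingZeros64(cache))
	if idx >= nelems {
		return nelems
	}
	return idx
}

// countFree counts unmarked (free) objects in a mark bitmap, one byte
// (8 objects) at a time; nelems is assumed to be a multiple of 8 here,
// while the real countFree masks a partial final byte.
func countFree(marks []byte, nelems int) int {
	marked := 0
	for _, b := range marks {
		marked += bits.OnesCount8(b)
	}
	return nelems - marked
}

func main() {
	// Allocation side: objects 0 and 1 are taken, so object 2 is next.
	fmt.Println(nextFree(0xfc, 0, 8)) // 2

	// Sweep side: 5 of 16 objects are marked, so 11 are free; if all 16
	// were free the span itself could be returned to the heap.
	fmt.Println(countFree([]byte{0x13, 0x41}, 16)) // 11
}
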
@@ -496,6 +496,33 @@ const (
	_FlagNoZero = 1 << 1 // don't zero memory
)

// nextFreeFast returns the next free object if one is quickly available.
// Otherwise it returns 0.
func (c *mcache) nextFreeFast(sizeclass int8) gclinkptr {
	s := c.alloc[sizeclass]
	ctzIndex := uint8(s.allocCache & 0xff)
	if ctzIndex != 0 {
		theBit := uint64(ctzVals[ctzIndex])
		freeidx := s.freeindex // help the pre-SSA compiler out here with CSE.
		result := freeidx + uintptr(theBit)
		if result < s.nelems {
			s.allocCache >>= (theBit + 1)
			freeidx = result + 1
			if freeidx%64 == 0 && freeidx != s.nelems {
				// We just incremented s.freeindex so it isn't 0,
				// so we are moving to the next allocCache word.
				whichByte := freeidx / 8
				s.refillAllocCache(whichByte)
			}
			s.freeindex = freeidx
			v := gclinkptr(result*s.elemsize + s.base())
			s.allocCount++
			return v
		}
	}
	return 0
}
// nextFree returns the next free object from the cached span if one is available.
// Otherwise it refills the cache with a span with an available object and
// returns that object along with a flag indicating that this was a heavy
@@ -508,9 +535,9 @@ func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) {
	freeIndex := s.nextFreeIndex()
	if freeIndex == s.nelems {
		// The span is full.
		if uintptr(s.allocCount) > s.nelems {
		if uintptr(s.allocCount) != s.nelems {
			println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
			throw("s.allocCount > s.nelems && freeIndex == s.nelems")
			throw("s.allocCount != s.nelems && freeIndex == s.nelems")
		}
		systemstack(func() {
			c.refill(int32(sizeclass))
@@ -644,7 +671,10 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
			}
			// Allocate a new maxTinySize block.
			var v gclinkptr
			v, shouldhelpgc = c.nextFree(tinySizeClass)
			v = c.nextFreeFast(tinySizeClass)
			if v == 0 {
				v, shouldhelpgc = c.nextFree(tinySizeClass)
			}
			x = unsafe.Pointer(v)
			(*[2]uint64)(x)[0] = 0
			(*[2]uint64)(x)[1] = 0
@@ -664,7 +694,10 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
			}
			size = uintptr(class_to_size[sizeclass])
			var v gclinkptr
			v, shouldhelpgc = c.nextFree(sizeclass)
			v = c.nextFreeFast(sizeclass)
			if v == 0 {
				v, shouldhelpgc = c.nextFree(sizeclass)
			}
			x = unsafe.Pointer(v)
			if flags&flagNoZero == 0 {
				memclr(unsafe.Pointer(v), size)
@@ -725,9 +758,27 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
		})
	}

	// The object x is about to be reused but tracefree and msanfree
	// need to be informed.
	// TODO:(rlh) It is quite possible that this object is being allocated
	// out of a fresh span and that there is no preceding call to
	// tracealloc with this object. If this is an issue then initialization
	// of the fresh span needs to leave some crumbs around that can be used to
	// avoid these calls. Furthermore these crumbs are likely the same as
	// those needed to determine if the object needs to be zeroed.
	// In the case of msanfree it does not make sense to call msanfree
	// followed by msanmalloc. msanfree indicates that the bytes are not
	// initialized but msanmalloc is about to indicate that they are.
	// It makes no difference whether msanmalloc has been called on these
	// bytes or not.
	if debug.allocfreetrace != 0 {
		tracefree(unsafe.Pointer(x), size)
	}

	if raceenabled {
		racemalloc(x, size)
	}

	if msanenabled {
		msanmalloc(x, size)
	}
......
@@ -186,17 +186,59 @@ func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
	return markBits{&s.allocBits[whichByte], uint8(1 << whichBit), allocBitIndex}
}
// ctzVals contains the count of trailing zeros for each index value.
// Index 0 maps to 8, indicating that all 8 bits are zero.
var ctzVals = [256]int8{
	8, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	4, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	5, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	4, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	6, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	4, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	5, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	4, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	7, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	4, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	5, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	4, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	6, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	4, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	5, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0,
	4, 0, 1, 0, 2, 0, 1, 0,
	3, 0, 1, 0, 2, 0, 1, 0}
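
Not part of this CL: a table like ctzVals is easy to regenerate and sanity-check by computing trailing zeros the slow way for every byte value, as in the hypothetical helper below.

package main

import "fmt"

// slowCtz8 counts trailing zeros in a byte the slow way; by convention it
// returns 8 for 0, matching the sentinel used by ctzVals.
func slowCtz8(b uint8) int8 {
	if b == 0 {
		return 8
	}
	var n int8
	for b&1 == 0 {
		n++
		b >>= 1
	}
	return n
}

func main() {
	var table [256]int8
	for i := 0; i < 256; i++ {
		table[i] = slowCtz8(uint8(i))
	}
	// Spot checks against the hard-coded values above.
	fmt.Println(table[0], table[1], table[2], table[0x40], table[0x80]) // 8 0 1 6 7
}
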
// A temporary stand-in for the count-trailing-zeros (ctz) instruction.
// On IA the BSF instruction works on a non-zero 64-bit word.
func ctz64(markBits uint64) uint64 {
	if markBits == 0 {
	ctz8 := ctzVals[markBits&0xff]
	if ctz8 != 8 {
		return uint64(ctz8)
	} else if markBits == 0 { // low byte is zero; check the full word.
		return 64 // bits in 64 bit word, ensures loop terminates
	}
	// tz holds trailing zero count.
	tz := uint64(0)
	for mask := uint64(1); mask&markBits == 0; mask, tz = mask<<1, tz+1 {
	result := uint64(8)
	markBits >>= 8
	for ctz8 = ctzVals[markBits&0xff]; ctz8 == 8; ctz8 = ctzVals[markBits&0xff] {
		result += 8
		markBits >>= 8
	}
	return tz
	result += uint64(ctz8)
	return result
}
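
Not part of this CL: the byte-at-a-time scan used by ctz64 relies on the identity that the trailing-zero count of a nonzero word is 8 for every all-zero low byte plus the trailing zeros of the first nonzero byte. The sketch below checks that identity against math/bits.TrailingZeros64, which only appeared in a later Go release; the runtime here uses the table-driven stand-in instead.

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	// ctz64 handles a zero word separately (returning 64); all sample
	// values here are nonzero.
	for _, x := range []uint64{1, 0x80, 0x100, 0x8000, 1 << 63, 0xf0f0000000000000} {
		zeros := 0
		v := x
		for uint8(v) == 0 { // skip all-zero low bytes, 8 zeros each
			zeros += 8
			v >>= 8
		}
		b := uint8(v)
		ctz8 := 0
		for b&1 == 0 { // trailing zeros of the first nonzero byte
			ctz8++
			b >>= 1
		}
		fmt.Println(x, zeros+ctz8, bits.TrailingZeros64(x)) // last two columns agree
	}
}
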
// refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
@@ -222,10 +264,12 @@ func (s *mspan) refillAllocCache(whichByte uintptr) {
// There are hardware instructions that can be used to make this
// faster if profiling warrants it.
func (s *mspan) nextFreeIndex() uintptr {
	if s.freeindex == s.nelems {
		return s.freeindex
	sfreeindex := s.freeindex
	snelems := s.nelems
	if sfreeindex == snelems {
		return sfreeindex
	}
	if s.freeindex > s.nelems {
	if sfreeindex > snelems {
		throw("s.freeindex > s.nelems")
	}
@@ -233,37 +277,37 @@ func (s *mspan) nextFreeIndex() uintptr {
	bitIndex := ctz64(aCache)
	for bitIndex == 64 {
		// Move index to start of next cached bits.
		s.freeindex = (s.freeindex + 64) &^ (64 - 1)
		if s.freeindex >= s.nelems {
			s.freeindex = s.nelems
			return s.freeindex
		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
		if sfreeindex >= snelems {
			s.freeindex = snelems
			return snelems
		}
		whichByte := s.freeindex / 8
		whichByte := sfreeindex / 8
		// Refill s.allocCache with the next 64 alloc bits.
		// Unlike in allocBits a 1 in s.allocCache means
		// the object is not marked.
		s.refillAllocCache(whichByte)
		aCache = s.allocCache
		bitIndex = ctz64(aCache)
		// Nothing was available; try again now that allocCache has been refilled.
	}
	result := s.freeindex + uintptr(bitIndex)
	if result >= s.nelems {
		s.freeindex = s.nelems
		return s.freeindex
	result := sfreeindex + uintptr(bitIndex)
	if result >= snelems {
		s.freeindex = snelems
		return snelems
	}
	s.allocCache >>= bitIndex + 1
	s.freeindex = result + 1
	if s.freeindex%64 == 0 && s.freeindex != s.nelems {
	s.allocCache >>= (bitIndex + 1)
	sfreeindex = result + 1
	if sfreeindex%64 == 0 && sfreeindex != snelems {
		// We just incremented s.freeindex so it isn't 0.
		// As each 1 in s.allocCache was encountered and used for allocation
		// it was shifted away. At this point s.allocCache contains all 0s.
		// Refill s.allocCache so that it corresponds
		// to the bits at s.allocBits starting at s.freeindex.
		whichByte := s.freeindex / 8
		whichByte := sfreeindex / 8
		s.refillAllocCache(whichByte)
	}
	s.freeindex = sfreeindex
	return result
}
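
Not part of this CL: the sketch below is a toy model of the freeindex/allocCache interplay that nextFreeIndex and nextFreeFast rely on. It assumes a plain []uint8 bitmap in which a 1 bit means the object is free (the cache produced by refillAllocCache is already negated into that form), and it shows the cache being consumed bit by bit and reloaded at each 64-object boundary. All names are hypothetical and math/bits stands in for the ctz helper.

package main

import (
	"fmt"
	"math/bits"
)

// span is a toy stand-in for the parts of mspan that nextFreeIndex uses.
type span struct {
	freeBits  []uint8 // one bit per object, 1 = free (allocBits negated)
	nelems    uint
	freeindex uint
	cache     uint64 // cached free bits; bit 0 corresponds to freeindex
}

// refill loads up to 8 bytes of freeBits starting at whichByte into cache.
func (s *span) refill(whichByte uint) {
	s.cache = 0
	for i := uint(0); i < 8 && whichByte+i < uint(len(s.freeBits)); i++ {
		s.cache |= uint64(s.freeBits[whichByte+i]) << (8 * i)
	}
}

// next mimics nextFreeIndex: skip 64-object windows with no free bit,
// otherwise return freeindex plus the trailing zeros of the cache.
func (s *span) next() uint {
	if s.freeindex == s.nelems {
		return s.nelems
	}
	for s.cache == 0 {
		// No free object left in the current window: move to the next
		// 64-object window and reload the cache from the bitmap.
		s.freeindex = (s.freeindex + 64) &^ 63
		if s.freeindex >= s.nelems {
			s.freeindex = s.nelems
			return s.nelems
		}
		s.refill(s.freeindex / 8)
	}
	tz := uint(bits.TrailingZeros64(s.cache))
	idx := s.freeindex + tz
	if idx >= s.nelems {
		s.freeindex = s.nelems
		return s.nelems
	}
	s.cache >>= tz + 1 // keep bit 0 aligned with the new freeindex
	s.freeindex = idx + 1
	if s.freeindex%64 == 0 && s.freeindex != s.nelems {
		// Crossed a 64-object boundary: refill so the cache again starts
		// at freeindex, just as nextFreeIndex does.
		s.refill(s.freeindex / 8)
	}
	return idx
}

func main() {
	s := &span{freeBits: make([]uint8, 16), nelems: 128}
	s.freeBits[0] = 0x05 // objects 0 and 2 are free
	s.freeBits[9] = 0x02 // object 73 is free
	s.refill(0)
	fmt.Println(s.next(), s.next(), s.next(), s.next()) // 0 2 73 128
}
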
@@ -760,120 +804,60 @@ func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
	}
}
// heapBitsSweepSpan coordinates the sweeping of a span and inspects
// each freed object. If objects are being traced or if msan is enabled
// then heapBitsSweepSpan calls f(p), where p is the object's base address.
// When not tracing and msan is not enabled heapBitsSweepSpan is lightweight.
// heapBitsSweepSpan never alters the pointer/scalar heapBit maps. HeapBit map
// maintenance is the responsibility of the allocation routines.
// TODO:(rlh) Deal with the checkmark bits by moving them
// out of the heap bitmap, thus enabling bulk clearing.
func heapBitsSweepSpan(s *mspan, f func(uintptr)) (nfree int) {
	base := s.base()
	size := s.elemsize
	n := s.nelems
	cl := s.sizeclass
	doCall := debug.allocfreetrace != 0 || msanenabled || cl == 0
	h := heapBitsForSpan(base)
	switch {
	default:
		throw("heapBitsSweepSpan")
	case sys.PtrSize == 8 && size == sys.PtrSize:
		nfree = heapBitsSweep8BitPtrs(h, s, base, n, cl, doCall, f)
	case size%(4*sys.PtrSize) == 0:
		nfree = heapBitsSweepMap(h, s, base, size, n, cl, doCall, f)
	case size%(4*sys.PtrSize) == 2*sys.PtrSize:
		nfree = heapBitsSweepMap(h, s, base, size, n, cl, doCall, f)
	}
	return
}

func heapBitsSweep8BitPtrs(h heapBits, s *mspan, base, n uintptr, cl uint8, doCall bool, f func(uintptr)) (nfree int) {
	mbits := s.markBitsForBase()
	// Consider mark bits in all four 2-bit entries of each bitmap byte.
	if cl == 0 {
		throw("8BitPtrs are not in cl 0")
	}
	// Consider mark bits in all four 2-bit entries of each bitmap byte.
	for i := uintptr(0); i < n; i++ {
		// Note that unlike the other size cases, we leave the pointer bits set here.
		// These are initialized during initSpan when the span is created and left
		// in place the whole time the span is used for pointer-sized objects.
		// That lets heapBitsSetType avoid an atomic update to set the pointer bit
		// during allocation.
		if !mbits.isMarked() {
			nfree++
			if mbits.index < s.freeindex {
				f(base + i*sys.PtrSize)
			} else if s.allocBits[mbits.index/8]&mbits.mask == 1 {
				// It was marked in the previous cycle but not this cycle;
				// if it wasn't marked in the previous cycle the call would be redundant.
				f(base + i*sys.PtrSize)
			}
		}
		mbits.advance()
	}
	return nfree
}

// nextFreed returns the next object that is being freed during this GC cycle.
// If the mark bit is not set then the object is free. If its index is
// < s.freeindex then the object was freed during this GC cycle.
// If it is >= freeindex then it was freed during this GC cycle
// only if the allocBit is set. If the allocBit is 0 it was freed
// during a previous cycle and so is not considered freed by this one.
func (m *markBits) nextFreed(nelems uintptr, s *mspan, totalFree *int) bool {
	mByte := *m.bytep
	for {
		for mByte == 0xff {
			if m.index >= nelems {
				return false
			}
			m.index = (m.index + 8) &^ (8 - 1)
			m.mask = 1
			m.bytep = add1(m.bytep)
			mByte = *m.bytep
			// Nothing free found; totalFree remains the same.
		}
		if m.index >= nelems {
			return false
		}
		for m.index < nelems {
			if m.mask&mByte == 0 {
				// At this point we have a free object so update totalFree.
				*totalFree++
				if m.index < s.freeindex {
					return true
				}
				if s.allocBits[m.index/8]&m.mask != 0 {
					return true
				}
			}
			if m.mask == 1<<7 {
				m.mask = 1
				m.bytep = add1(m.bytep)
				mByte = *m.bytep
				m.index++
				break
			} else {
				m.mask = m.mask << 1
				m.index++
			}
		}
	}
	return false
}

func heapBitsSweepMap(h heapBits, s *mspan, base, size, n uintptr, cl uint8, doCall bool, f func(uintptr)) int {
	totalFree := 0
	twobits := s.markBitsForBase()
	for twobits.nextFreed(n, s, &totalFree) {
		if doCall {
			f(base + twobits.index*size)
		}
		twobits.advance()
	}
	return totalFree
}
// oneBitCount is indexed by byte and produces the
// number of 1 bits in that byte. For example 128 has 1 bit set
// and oneBitCount[128] holds 1.
var oneBitCount = [256]uint8{
	0, 1, 1, 2, 1, 2, 2, 3,
	1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7,
	5, 6, 6, 7, 6, 7, 7, 8}
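
Not part of this CL: a table like oneBitCount can be regenerated with the standard recurrence popcount(i) = popcount(i>>1) + (i & 1), as in the illustrative snippet below.

package main

import "fmt"

func main() {
	// Rebuild an 8-bit popcount table and spot-check it against the
	// hard-coded values above.
	var pop [256]uint8
	for i := 1; i < 256; i++ {
		pop[i] = pop[i>>1] + uint8(i&1)
	}
	fmt.Println(pop[0], pop[128], pop[0xb2], pop[255]) // 0 1 4 8
}
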
// countFree runs through the mark bits in a span and counts the number of free objects
// in the span.
// TODO:(rlh) Use popcount intrinsic.
func (s *mspan) countFree() int {
	count := 0
	maxIndex := s.nelems / 8
	for i := uintptr(0); i < maxIndex; i++ {
		count += int(oneBitCount[s.gcmarkBits[i]])
	}
	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
		markBits := uint8(s.gcmarkBits[maxIndex])
		mask := uint8((1 << bitsInLastByte) - 1)
		bits := markBits & mask
		count += int(oneBitCount[bits])
	}
	return int(s.nelems) - count
}
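
Not part of this CL: the TODO above asks for a popcount intrinsic. Later Go releases expose one as math/bits.OnesCount8, so outside the runtime a countFree equivalent could look like the hedged sketch below; note the same masking of the partial final byte.

package main

import (
	"fmt"
	"math/bits"
)

// countFree counts the 0 (free) bits among the first nelems bits of a mark
// bitmap, masking the bits of the last byte that fall beyond nelems.
func countFree(gcmarkBits []uint8, nelems uintptr) int {
	count := 0
	maxIndex := nelems / 8
	for i := uintptr(0); i < maxIndex; i++ {
		count += bits.OnesCount8(gcmarkBits[i])
	}
	if bitsInLastByte := nelems % 8; bitsInLastByte != 0 {
		mask := uint8(1)<<bitsInLastByte - 1
		count += bits.OnesCount8(gcmarkBits[maxIndex] & mask)
	}
	return int(nelems) - count
}

func main() {
	// 10 objects; objects 0, 3 and 9 are marked, so 7 are free.
	fmt.Println(countFree([]uint8{0x09, 0x02}, 10)) // 7
}
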
// heapBitsSetType records that the new allocation [x, x+size)
......
@@ -8,7 +8,6 @@ package runtime
import (
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)
@@ -252,40 +251,13 @@ func (s *mspan) sweep(preserve bool) bool {
		}
	}

	// Sweep through n objects of given size starting at p.
	// This thread owns the span now, so it can manipulate
	// the block bitmap without atomic operations.
	nfree = heapBitsSweepSpan(s, func(p uintptr) {
		// At this point we know that we are looking at a garbage object
		// that needs to be collected.
		if debug.allocfreetrace != 0 {
			tracefree(unsafe.Pointer(p), size)
		}
		if msanenabled {
			msanfree(unsafe.Pointer(p), size)
		}
		// Reset to allocated+noscan.
		if cl == 0 {
			// Free large span.
			if preserve {
				throw("can't preserve large span")
			}
			s.needzero = 1
	// Count the number of free objects in this span.
	nfree = s.countFree()
	if cl == 0 && nfree != 0 {
		s.needzero = 1
		freeToHeap = true
	}
			// Free the span after heapBitsSweepSpan
			// returns, since it's not done with the span.
			freeToHeap = true
		} else {
			// Free small object.
			if size > 2*sys.PtrSize {
				*(*uintptr)(unsafe.Pointer(p + sys.PtrSize)) = uintptrMask & 0xdeaddeaddeaddead // mark as "needs to be zeroed"
			} else if size > sys.PtrSize {
				*(*uintptr)(unsafe.Pointer(p + sys.PtrSize)) = 0
			}
		}
	})
	s.allocCount = uint16(s.nelems) - uint16(nfree)
	wasempty := s.nextFreeIndex() == s.nelems
......
@@ -137,6 +137,9 @@ type mspan struct {
	//
	// Object n starts at address n*elemsize + (start << pageShift).
	freeindex uintptr
	// TODO: Look up nelems from sizeclass and remove this field if it
	// helps performance.
	nelems uintptr // number of objects in the span.
	// Cache of the allocBits at freeindex. allocCache is shifted
	// such that the lowest bit corresponds to the bit freeindex.
@@ -147,9 +150,6 @@ type mspan struct {
	allocCache uint64
	allocBits  *[maxObjsPerSpan / 8]uint8
	gcmarkBits *[maxObjsPerSpan / 8]uint8
	nelems uintptr // number of objects in the span.
	// TODO(rlh) consider moving some of these fields into separate arrays.
	// Put another way, is an array of structs a better idea than a struct of arrays?
	// allocBits and gcmarkBits currently point to either markbits1
	// or markbits2. At the end of a GC cycle allocBits and
@@ -753,6 +753,12 @@ func (h *mheap) freeSpan(s *mspan, acct int32) {
		mp.mcache.local_scan = 0
		memstats.tinyallocs += uint64(mp.mcache.local_tinyallocs)
		mp.mcache.local_tinyallocs = 0
		if msanenabled {
			// Tell msan that this entire span is no longer in use.
			base := unsafe.Pointer(s.base())
			bytes := s.npages << _PageShift
			msanfree(base, bytes)
		}
		if acct != 0 {
			memstats.heap_objects--
		}
......