Commit 484f801f authored by Russ Cox

runtime: reorganize memory code

Move code from malloc1.go, malloc2.go, mem.go, mgc0.go into
appropriate locations.

Factor mgc.go into mgc.go, mgcmark.go, mgcsweep.go, mstats.go.

A lot of this code was in certain files because the right place was in
a C file but it was written in Go, or vice versa. This is one step toward
making things actually well-organized again.

Change-Id: I6741deb88a7cfb1c17ffe0bcca3989e10207968f
Reviewed-on: https://go-review.googlesource.com/5300
Reviewed-by: Austin Clements <austin@google.com>
Reviewed-by: Rick Hudson <rlh@golang.org>
parent d384545a
@@ -13,6 +13,24 @@ package runtime
import "unsafe"
//go:linkname runtime_debug_WriteHeapDump runtime/debug.WriteHeapDump
func runtime_debug_WriteHeapDump(fd uintptr) {
semacquire(&worldsema, false)
gp := getg()
gp.m.preemptoff = "write heap dump"
systemstack(stoptheworld)
systemstack(func() {
writeheapdump_m(fd)
})
gp.m.preemptoff = ""
gp.m.locks++
semrelease(&worldsema)
systemstack(starttheworld)
gp.m.locks--
}
const (
fieldKindEol = 0
fieldKindPtr = 1
@@ -2,6 +2,84 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Memory allocator, based on tcmalloc.
// http://goog-perftools.sourceforge.net/doc/tcmalloc.html
// The main allocator works in runs of pages.
// Small allocation sizes (up to and including 32 kB) are
// rounded to one of about 100 size classes, each of which
// has its own free list of objects of exactly that size.
// Any free page of memory can be split into a set of objects
// of one size class, which are then managed using free list
// allocators.
//
// The allocator's data structures are:
//
// FixAlloc: a free-list allocator for fixed-size objects,
// used to manage storage used by the allocator.
// MHeap: the malloc heap, managed at page (4096-byte) granularity.
// MSpan: a run of pages managed by the MHeap.
// MCentral: a shared free list for a given size class.
// MCache: a per-thread (in Go, per-P) cache for small objects.
// MStats: allocation statistics.
//
// Allocating a small object proceeds up a hierarchy of caches:
//
// 1. Round the size up to one of the small size classes
// and look in the corresponding MCache free list.
// If the list is not empty, allocate an object from it.
// This can all be done without acquiring a lock.
//
// 2. If the MCache free list is empty, replenish it by
// taking a bunch of objects from the MCentral free list.
// Moving a bunch amortizes the cost of acquiring the MCentral lock.
//
// 3. If the MCentral free list is empty, replenish it by
// allocating a run of pages from the MHeap and then
// chopping that memory into objects of the given size.
// Allocating many objects amortizes the cost of locking
// the heap.
//
// 4. If the MHeap is empty or has no page runs large enough,
// allocate a new group of pages (at least 1MB) from the
// operating system. Allocating a large run of pages
// amortizes the cost of talking to the operating system.
//
// Freeing a small object proceeds up the same hierarchy:
//
// 1. Look up the size class for the object and add it to
// the MCache free list.
//
// 2. If the MCache free list is too long or the MCache has
// too much memory, return some to the MCentral free lists.
//
// 3. If all the objects in a given span have returned to
// the MCentral list, return that span to the page heap.
//
// 4. If the heap has too much memory, return some to the
// operating system.
//
// TODO(rsc): Step 4 is not implemented.
//
// Allocating and freeing a large object uses the page heap
// directly, bypassing the MCache and MCentral free lists.
//
// The small objects on the MCache and MCentral free lists
// may or may not be zeroed. They are zeroed if and only if
// the second word of the object is zero. A span in the
// page heap is zeroed unless s->needzero is set. When a span
// is allocated to break into small objects, it is zeroed if needed
// and s->needzero is set. There are two main benefits to delaying the
// zeroing this way:
//
// 1. stack frames allocated from the small object lists
// or the page heap can avoid zeroing altogether.
// 2. the cost of zeroing when reusing a small object is
// charged to the mutator, not the garbage collector.
//
// This code was written with an eye toward translating to Go
// in the future. Methods have the form Type_Method(Type *t, ...).
package runtime
import "unsafe"
@@ -25,29 +103,369 @@ const (
concurrentSweep = _ConcurrentSweep
)
const (
_PageShift = 13
_PageSize = 1 << _PageShift
_PageMask = _PageSize - 1
)
const (
// _64bit = 1 on 64-bit systems, 0 on 32-bit systems
_64bit = 1 << (^uintptr(0) >> 63) / 2
// Computed constant. The definition of MaxSmallSize and the
// algorithm in msize.c produce some number of different allocation
// size classes. NumSizeClasses is that number. It's needed here
// because there are static arrays of this length; when msize runs its
// size choosing algorithm it double-checks that NumSizeClasses agrees.
_NumSizeClasses = 67
// Tunable constants.
_MaxSmallSize = 32 << 10
// Tiny allocator parameters, see "Tiny allocator" comment in malloc.go.
_TinySize = 16
_TinySizeClass = 2
_FixAllocChunk = 16 << 10 // Chunk size for FixAlloc
_MaxMHeapList = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap.
_HeapAllocChunk = 1 << 20 // Chunk size for heap growth
// Per-P, per order stack segment cache size.
_StackCacheSize = 32 * 1024
// Number of orders that get caching. Order 0 is FixedStack
// and each successive order is twice as large.
// We want to cache 2KB, 4KB, 8KB, and 16KB stacks. Larger stacks
// will be allocated directly.
// Since FixedStack is different on different systems, we
// must vary NumStackOrders to keep the same maximum cached size.
// OS | FixedStack | NumStackOrders
// -----------------+------------+---------------
// linux/darwin/bsd | 2KB | 4
// windows/32 | 4KB | 3
// windows/64 | 8KB | 2
// plan9 | 4KB | 3
_NumStackOrders = 4 - ptrSize/4*goos_windows - 1*goos_plan9
// Number of bits in page to span calculations (4k pages).
// On Windows 64-bit we limit the arena to 32GB or 35 bits.
// Windows counts memory used by page table into committed memory
// of the process, so we can't reserve too much memory.
// See http://golang.org/issue/5402 and http://golang.org/issue/5236.
// On other 64-bit platforms, we limit the arena to 128GB, or 37 bits.
// On 32-bit, we don't bother limiting anything, so we use the full 32-bit address.
_MHeapMap_TotalBits = (_64bit*goos_windows)*35 + (_64bit*(1-goos_windows))*37 + (1-_64bit)*32
_MHeapMap_Bits = _MHeapMap_TotalBits - _PageShift
_MaxMem = uintptr(1<<_MHeapMap_TotalBits - 1)
// Max number of threads to run garbage collection.
// 2, 3, and 4 are all plausible maximums depending
// on the hardware details of the machine. The garbage
// collector scales well to 32 cpus.
_MaxGcproc = 32
)
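A quick worked evaluation of two of the trickier constant expressions above, as a hypothetical standalone program; goosWindows here is a stand-in for the goos_windows constant and the comments assume a linux/amd64 build.
package main
import "fmt"
func main() {
	const goosWindows = 0 // assumption: building for linux/amd64, not Windows
	_64bit := uintptr(1) << (^uintptr(0) >> 63) / 2 // 1 on 64-bit systems, 0 on 32-bit
	totalBits := (_64bit*goosWindows)*35 + (_64bit*(1-goosWindows))*37 + (1-_64bit)*32
	maxMem := uintptr(1)<<totalBits - 1
	// On linux/amd64 this prints: 1 37 131071 MB, i.e. a 128 GB arena limit.
	fmt.Println(_64bit, totalBits, maxMem>>20, "MB")
}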
// Page number (address>>pageShift)
type pageID uintptr
const _MaxArena32 = 2 << 30
// OS-defined helpers:
//
// sysAlloc obtains a large chunk of zeroed memory from the
// operating system, typically on the order of a hundred kilobytes
// or a megabyte.
// NOTE: sysAlloc returns OS-aligned memory, but the heap allocator
// may use larger alignment, so the caller must be careful to realign the
// memory obtained by sysAlloc.
//
// SysUnused notifies the operating system that the contents
// of the memory region are no longer needed and can be reused
// for other purposes.
// SysUsed notifies the operating system that the contents
// of the memory region are needed again.
//
// SysFree returns it unconditionally; this is only used if
// an out-of-memory error has been detected midway through
// an allocation. It is okay if SysFree is a no-op.
//
// SysReserve reserves address space without allocating memory.
// If the pointer passed to it is non-nil, the caller wants the
// reservation there, but SysReserve can still choose another
// location if that one is unavailable. On some systems and in some
// cases SysReserve will simply check that the address space is
// available and not actually reserve it. If SysReserve returns
// non-nil, it sets *reserved to true if the address space is
// reserved, false if it has merely been checked.
// NOTE: SysReserve returns OS-aligned memory, but the heap allocator
// may use larger alignment, so the caller must be careful to realign the
// memory obtained by sysAlloc.
//
// SysMap maps previously reserved address space for use.
// The reserved argument is true if the address space was really
// reserved, not merely checked.
//
// SysFault marks an (already sysAlloc'd) region to fault
// if accessed. Used only for debugging the runtime.
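As a rough analogy for how sysReserve and sysMap cooperate (reserve address space cheaply, commit pages later), here is a hypothetical Linux-only sketch using raw mmap via the syscall package; it is not the runtime's implementation.
package main
import (
	"fmt"
	"syscall"
)
func main() {
	const reserve = 1 << 30 // "sysReserve": 1 GB of address space, nothing committed
	const commit = 1 << 20  // "sysMap": back the first 1 MB with usable memory
	base, err := syscall.Mmap(-1, 0, reserve, syscall.PROT_NONE,
		syscall.MAP_PRIVATE|syscall.MAP_ANON|syscall.MAP_NORESERVE)
	if err != nil {
		panic(err)
	}
	if err := syscall.Mprotect(base[:commit], syscall.PROT_READ|syscall.PROT_WRITE); err != nil {
		panic(err)
	}
	base[0] = 1 // safe now; touching beyond commit would fault, like an unmapped arena
	fmt.Printf("reserved %d bytes, committed %d bytes\n", reserve, commit)
}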
func mallocinit() {
initSizes()
if class_to_size[_TinySizeClass] != _TinySize {
throw("bad TinySizeClass")
}
var p, bitmapSize, spansSize, pSize, limit uintptr
var reserved bool
// limit = runtime.memlimit();
// See https://golang.org/issue/5049
// TODO(rsc): Fix after 1.1.
limit = 0
// Set up the allocation arena, a contiguous area of memory where
// allocated data will be found. The arena begins with a bitmap large
// enough to hold 4 bits per allocated word.
if ptrSize == 8 && (limit == 0 || limit > 1<<30) {
// On a 64-bit machine, allocate from a single contiguous reservation.
// 128 GB (MaxMem) should be big enough for now.
//
// The code will work with the reservation at any address, but ask
// SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f).
// Allocating a 128 GB region takes away 37 bits, and the amd64
// doesn't let us choose the top 17 bits, so that leaves the 11 bits
// in the middle of 0x00c0 for us to choose. Choosing 0x00c0 means
// that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df.
// In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid
// UTF-8 sequences, and they are otherwise as far away from
// ff (likely a common byte) as possible. If that fails, we try other 0xXXc0
// addresses. An earlier attempt to use 0x11f8 caused out of memory errors
// on OS X during thread allocations. 0x00c0 causes conflicts with
// AddressSanitizer which reserves all memory up to 0x0100.
// These choices are both for debuggability and to reduce the
// odds of the conservative garbage collector not collecting memory
// because some non-pointer block of memory had a bit pattern
// that matched a memory address.
//
// Actually we reserve 136 GB (because the bitmap ends up being 8 GB)
// but it hardly matters: e0 00 is not valid UTF-8 either.
//
// If this fails we fall back to the 32 bit memory mechanism
arenaSize := round(_MaxMem, _PageSize)
bitmapSize = arenaSize / (ptrSize * 8 / 4)
spansSize = arenaSize / _PageSize * ptrSize
spansSize = round(spansSize, _PageSize)
for i := 0; i <= 0x7f; i++ {
p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
pSize = bitmapSize + spansSize + arenaSize + _PageSize
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
if p != 0 {
break
}
}
}
if p == 0 {
// On a 32-bit machine, we can't typically get away
// with a giant virtual address space reservation.
// Instead we map the memory information bitmap
// immediately after the data segment, large enough
// to handle another 2GB of mappings (256 MB),
// along with a reservation for an initial arena.
// When that gets used up, we'll start asking the kernel
// for any memory anywhere and hope it's in the 2GB
// following the bitmap (presumably the executable begins
// near the bottom of memory, so we'll have to use up
// most of memory before the kernel resorts to giving out
// memory before the beginning of the text segment).
//
// Alternatively we could reserve 512 MB bitmap, enough
// for 4GB of mappings, and then accept any memory the
// kernel threw at us, but normally that's a waste of 512 MB
// of address space, which is probably too much in a 32-bit world.
// If we fail to allocate, try again with a smaller arena.
// This is necessary on Android L where we share a process
// with ART, which reserves virtual memory aggressively.
arenaSizes := []uintptr{
512 << 20,
256 << 20,
}
for _, arenaSize := range arenaSizes {
bitmapSize = _MaxArena32 / (ptrSize * 8 / 4)
spansSize = _MaxArena32 / _PageSize * ptrSize
if limit > 0 && arenaSize+bitmapSize+spansSize > limit {
bitmapSize = (limit / 9) &^ ((1 << _PageShift) - 1)
arenaSize = bitmapSize * 8
spansSize = arenaSize / _PageSize * ptrSize
}
spansSize = round(spansSize, _PageSize)
// SysReserve treats the address we ask for, end, as a hint,
// not as an absolute requirement. If we ask for the end
// of the data segment but the operating system requires
// a little more space before we can start allocating, it will
// give out a slightly higher pointer. Except QEMU, which
// is buggy, as usual: it won't adjust the pointer upward.
// So adjust it upward a little bit ourselves: 1/4 MB to get
// away from the running binary image and then round up
// to a MB boundary.
p = round(uintptr(unsafe.Pointer(&end))+(1<<18), 1<<20)
pSize = bitmapSize + spansSize + arenaSize + _PageSize
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
if p != 0 {
break
}
}
if p == 0 {
throw("runtime: cannot reserve arena virtual address space")
}
}
// PageSize can be larger than OS definition of page size,
// so SysReserve can give us a PageSize-unaligned pointer.
// To overcome this we ask for PageSize more and round up the pointer.
p1 := round(p, _PageSize)
mheap_.spans = (**mspan)(unsafe.Pointer(p1))
mheap_.bitmap = p1 + spansSize
mheap_.arena_start = p1 + (spansSize + bitmapSize)
mheap_.arena_used = mheap_.arena_start
mheap_.arena_end = p + pSize
mheap_.arena_reserved = reserved
if mheap_.arena_start&(_PageSize-1) != 0 {
println("bad pagesize", hex(p), hex(p1), hex(spansSize), hex(bitmapSize), hex(_PageSize), "start", hex(mheap_.arena_start))
throw("misrounded allocation in mallocinit")
}
// Initialize the rest of the allocator.
mHeap_Init(&mheap_, spansSize)
_g_ := getg()
_g_.m.mcache = allocmcache()
}
// sysReserveHigh reserves space somewhere high in the address space.
// sysReserve doesn't actually reserve the full amount requested on
// 64-bit systems, because of problems with ulimit. Instead it checks
// that it can get the first 64 kB and assumes it can grab the rest as
// needed. This doesn't work well with the "let the kernel pick an address"
// mode, so don't do that. Pick a high address instead.
func sysReserveHigh(n uintptr, reserved *bool) unsafe.Pointer {
if ptrSize == 4 {
return sysReserve(nil, n, reserved)
}
for i := 0; i <= 0x7f; i++ {
p := uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
*reserved = false
p = uintptr(sysReserve(unsafe.Pointer(p), n, reserved))
if p != 0 {
return unsafe.Pointer(p)
}
}
return sysReserve(nil, n, reserved)
}
func mHeap_SysAlloc(h *mheap, n uintptr) unsafe.Pointer {
if n > uintptr(h.arena_end)-uintptr(h.arena_used) {
// We are in 32-bit mode, maybe we didn't use all possible address space yet.
// Reserve some more space.
p_size := round(n+_PageSize, 256<<20)
new_end := h.arena_end + p_size
if new_end <= h.arena_start+_MaxArena32 {
// TODO: It would be bad if part of the arena
// is reserved and part is not.
var reserved bool
p := uintptr(sysReserve((unsafe.Pointer)(h.arena_end), p_size, &reserved))
if p == h.arena_end {
h.arena_end = new_end
h.arena_reserved = reserved
} else if p+p_size <= h.arena_start+_MaxArena32 {
// Keep everything page-aligned.
// Our pages are bigger than hardware pages.
h.arena_end = p + p_size
h.arena_used = p + (-uintptr(p) & (_PageSize - 1))
h.arena_reserved = reserved
} else {
var stat uint64
sysFree((unsafe.Pointer)(p), p_size, &stat)
}
}
}
if n <= uintptr(h.arena_end)-uintptr(h.arena_used) {
// Keep taking from our reservation.
p := h.arena_used
sysMap((unsafe.Pointer)(p), n, h.arena_reserved, &memstats.heap_sys)
h.arena_used += n
mHeap_MapBits(h)
mHeap_MapSpans(h)
if raceenabled {
racemapshadow((unsafe.Pointer)(p), n)
}
if mheap_.shadow_enabled {
sysMap(unsafe.Pointer(p+mheap_.shadow_heap), n, h.shadow_reserved, &memstats.other_sys)
}
if uintptr(p)&(_PageSize-1) != 0 {
throw("misrounded allocation in MHeap_SysAlloc")
}
return (unsafe.Pointer)(p)
}
// If using 64-bit, our reservation is all we have.
if uintptr(h.arena_end)-uintptr(h.arena_start) >= _MaxArena32 {
return nil
}
// On 32-bit, once the reservation is gone we can
// try to get memory at a location chosen by the OS
// and hope that it is in the range we allocated bitmap for.
p_size := round(n, _PageSize) + _PageSize
p := uintptr(sysAlloc(p_size, &memstats.heap_sys))
if p == 0 {
return nil
}
if p < h.arena_start || uintptr(p)+p_size-uintptr(h.arena_start) >= _MaxArena32 {
print("runtime: memory allocated by OS (", p, ") not in usable range [", hex(h.arena_start), ",", hex(h.arena_start+_MaxArena32), ")\n")
sysFree((unsafe.Pointer)(p), p_size, &memstats.heap_sys)
return nil
}
p_end := p + p_size
p += -p & (_PageSize - 1)
if uintptr(p)+n > uintptr(h.arena_used) {
h.arena_used = p + n
if p_end > h.arena_end {
h.arena_end = p_end
}
mHeap_MapBits(h)
mHeap_MapSpans(h)
if raceenabled {
racemapshadow((unsafe.Pointer)(p), n)
}
}
if uintptr(p)&(_PageSize-1) != 0 {
throw("misrounded allocation in MHeap_SysAlloc")
}
return (unsafe.Pointer)(p)
}
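The alignment idiom p += -p & (_PageSize - 1) used above rounds p up to the next page boundary; a worked example with illustrative numbers:
// With _PageSize = 8192 and p = 20580 (not page-aligned):
//   -p & 8191 == 3996, so p becomes 24576 == 3*8192, the next page boundary.
// If p is already aligned, -p & 8191 == 0 and p is unchanged.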
// base address for all 0-byte allocations
var zerobase uintptr
// Trigger the concurrent GC when 1/triggerratio memory is available to allocate.
// Adjust this ratio as part of a scheme to ensure that mutators have enough
// memory to allocate during a concurrent GC cycle.
var triggerratio = int64(8)
// Determine whether to initiate a GC.
// If the GC is already working no need to trigger another one.
// This should establish a feedback loop where if the GC does not
// have sufficient time to complete then more memory will be
// requested from the OS, increasing the heap size and thus allowing future
// GCs more time to complete.
// memstats.heap_alloc and memstats.next_gc reads have benign races.
// A false negative simply does not start a GC, a false positive
// will start a GC needlessly. Neither has correctness issues.
func shouldtriggergc() bool {
return triggerratio*(int64(memstats.next_gc)-int64(memstats.heap_alloc)) <= int64(memstats.next_gc) && atomicloaduint(&bggc.working) == 0
}
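With the default triggerratio of 8, the condition above fires once less than 1/8 of the gap to next_gc remains; a worked example with hypothetical numbers:
// With next_gc = 800 MB and triggerratio = 8:
//   heap_alloc = 700 MB: 8*(800-700) = 800 <= 800  → start a background GC
//   heap_alloc = 650 MB: 8*(800-650) = 1200 > 800  → keep allocating, no GC yet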
const (
// flags to malloc
_FlagNoScan = 1 << 0 // GC doesn't have to scan object
_FlagNoZero = 1 << 1 // don't zero memory
)
// Allocate an object of size bytes.
// Small objects are allocated from the per-P cache's free lists.
@@ -250,6 +668,25 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
return x
}
func largeAlloc(size uintptr, flag uint32) *mspan {
// print("largeAlloc size=", size, "\n")
if size+_PageSize < size {
throw("out of memory")
}
npages := size >> _PageShift
if size&_PageMask != 0 {
npages++
}
s := mHeap_Alloc(&mheap_, npages, 0, true, flag&_FlagNoZero == 0)
if s == nil {
throw("out of memory")
}
s.limit = uintptr(s.start)<<_PageShift + size
heapBitsForSpan(s.base()).initSpan(s.layout())
return s
}
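The page count above is a ceiling division by the 8 KB page size; for example:
// size = 40000: npages = 40000>>13 = 4, and 40000&8191 != 0, so npages becomes 5
// size = 40960: npages = 40960>>13 = 5 exactly, no rounding needed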
// implementation of new builtin
func newobject(typ *_type) unsafe.Pointer {
flags := uint32(0)
@@ -310,289 +747,6 @@ func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
mProf_Malloc(x, size)
}
// For now this must be bracketed with a stoptheworld and a starttheworld to ensure
// all go routines see the new barrier.
//go:nowritebarrier
func gcinstallmarkwb() {
gcphase = _GCmark
}
// force = 0 - start concurrent GC
// force = 1 - do STW GC regardless of current heap usage
// force = 2 - go STW GC and eager sweep
func gogc(force int32) {
// The gc is turned off (via enablegc) until the bootstrap has completed.
// Also, malloc gets called in the guts of a number of libraries that might be
// holding locks. To avoid deadlocks during stoptheworld, don't bother
// trying to run gc while holding a lock. The next mallocgc without a lock
// will do the gc instead.
mp := acquirem()
if gp := getg(); gp == mp.g0 || mp.locks > 1 || !memstats.enablegc || panicking != 0 || gcpercent < 0 {
releasem(mp)
return
}
releasem(mp)
mp = nil
if force == 0 {
lock(&bggc.lock)
if !bggc.started {
bggc.working = 1
bggc.started = true
go backgroundgc()
} else if bggc.working == 0 {
bggc.working = 1
ready(bggc.g)
}
unlock(&bggc.lock)
} else {
gcwork(force)
}
}
func gcwork(force int32) {
semacquire(&worldsema, false)
// Pick up the remaining unswept/not being swept spans concurrently
for gosweepone() != ^uintptr(0) {
sweep.nbgsweep++
}
// Ok, we're doing it! Stop everybody else
mp := acquirem()
mp.preemptoff = "gcing"
releasem(mp)
gctimer.count++
if force == 0 {
gctimer.cycle.sweepterm = nanotime()
}
if trace.enabled {
traceGoSched()
traceGCStart()
}
// Pick up the remaining unswept/not being swept spans before we STW
for gosweepone() != ^uintptr(0) {
sweep.nbgsweep++
}
systemstack(stoptheworld)
systemstack(finishsweep_m) // finish sweep before we start concurrent scan.
if force == 0 { // Do as much work concurrently as possible
gcphase = _GCscan
systemstack(starttheworld)
gctimer.cycle.scan = nanotime()
// Do a concurrent heap scan before we stop the world.
systemstack(gcscan_m)
gctimer.cycle.installmarkwb = nanotime()
systemstack(stoptheworld)
systemstack(gcinstallmarkwb)
systemstack(harvestwbufs)
systemstack(starttheworld)
gctimer.cycle.mark = nanotime()
systemstack(gcmark_m)
gctimer.cycle.markterm = nanotime()
systemstack(stoptheworld)
systemstack(gcinstalloffwb_m)
} else {
// For non-concurrent GC (force != 0) g stack have not been scanned so
// set gcscanvalid such that mark termination scans all stacks.
// No races here since we are in a STW phase.
for _, gp := range allgs {
gp.gcworkdone = false // set to true in gcphasework
gp.gcscanvalid = false // stack has not been scanned
}
}
startTime := nanotime()
if mp != acquirem() {
throw("gogc: rescheduled")
}
clearpools()
// Run gc on the g0 stack. We do this so that the g stack
// we're currently running on will no longer change. Cuts
// the root set down a bit (g0 stacks are not scanned, and
// we don't need to scan gc's internal state). We also
// need to switch to g0 so we can shrink the stack.
n := 1
if debug.gctrace > 1 {
n = 2
}
eagersweep := force >= 2
for i := 0; i < n; i++ {
if i > 0 {
// refresh start time if doing a second GC
startTime = nanotime()
}
// switch to g0, call gc, then switch back
systemstack(func() {
gc_m(startTime, eagersweep)
})
}
systemstack(func() {
gccheckmark_m(startTime, eagersweep)
})
if trace.enabled {
traceGCDone()
traceGoStart()
}
// all done
mp.preemptoff = ""
if force == 0 {
gctimer.cycle.sweep = nanotime()
}
semrelease(&worldsema)
if force == 0 {
if gctimer.verbose > 1 {
GCprinttimes()
} else if gctimer.verbose > 0 {
calctimes() // ignore result
}
}
systemstack(starttheworld)
releasem(mp)
mp = nil
// now that gc is done, kick off finalizer thread if needed
if !concurrentSweep {
// give the queued finalizers, if any, a chance to run
Gosched()
}
}
// gctimes records the time in nanoseconds of each phase of the concurrent GC.
type gctimes struct {
sweepterm int64 // stw
scan int64
installmarkwb int64 // stw
mark int64
markterm int64 // stw
sweep int64
}
// gcchronograph holds timer information related to GC phases
// max records the maximum time spent in each GC phase since GCstarttimes.
// total records the total time spent in each GC phase since GCstarttimes.
// cycle records the absolute time (as returned by nanoseconds()) that each GC phase last started at.
type gcchronograph struct {
count int64
verbose int64
maxpause int64
max gctimes
total gctimes
cycle gctimes
}
var gctimer gcchronograph
// GCstarttimes initializes the gc times. All previous times are lost.
func GCstarttimes(verbose int64) {
gctimer = gcchronograph{verbose: verbose}
}
// GCendtimes stops the gc timers.
func GCendtimes() {
gctimer.verbose = 0
}
// calctimes converts gctimer.cycle into the elapsed times, updates gctimer.total
// and updates gctimer.max with the max pause time.
func calctimes() gctimes {
var times gctimes
var max = func(a, b int64) int64 {
if a > b {
return a
}
return b
}
times.sweepterm = gctimer.cycle.scan - gctimer.cycle.sweepterm
gctimer.total.sweepterm += times.sweepterm
gctimer.max.sweepterm = max(gctimer.max.sweepterm, times.sweepterm)
gctimer.maxpause = max(gctimer.maxpause, gctimer.max.sweepterm)
times.scan = gctimer.cycle.installmarkwb - gctimer.cycle.scan
gctimer.total.scan += times.scan
gctimer.max.scan = max(gctimer.max.scan, times.scan)
times.installmarkwb = gctimer.cycle.mark - gctimer.cycle.installmarkwb
gctimer.total.installmarkwb += times.installmarkwb
gctimer.max.installmarkwb = max(gctimer.max.installmarkwb, times.installmarkwb)
gctimer.maxpause = max(gctimer.maxpause, gctimer.max.installmarkwb)
times.mark = gctimer.cycle.markterm - gctimer.cycle.mark
gctimer.total.mark += times.mark
gctimer.max.mark = max(gctimer.max.mark, times.mark)
times.markterm = gctimer.cycle.sweep - gctimer.cycle.markterm
gctimer.total.markterm += times.markterm
gctimer.max.markterm = max(gctimer.max.markterm, times.markterm)
gctimer.maxpause = max(gctimer.maxpause, gctimer.max.markterm)
return times
}
// GCprinttimes prints latency information in nanoseconds about various
// phases in the GC. The information for each phase includes the maximum pause
// and total time since the most recent call to GCstarttimes as well as
// the information from the most recent Concurrent GC cycle. Calls from the
// application to runtime.GC() are ignored.
func GCprinttimes() {
if gctimer.verbose == 0 {
println("GC timers not enabled")
return
}
// Explicitly put times on the heap so printPhase can use it.
times := new(gctimes)
*times = calctimes()
cycletime := gctimer.cycle.sweep - gctimer.cycle.sweepterm
pause := times.sweepterm + times.installmarkwb + times.markterm
gomaxprocs := GOMAXPROCS(-1)
printlock()
print("GC: #", gctimer.count, " ", cycletime, "ns @", gctimer.cycle.sweepterm, " pause=", pause, " maxpause=", gctimer.maxpause, " goroutines=", allglen, " gomaxprocs=", gomaxprocs, "\n")
printPhase := func(label string, get func(*gctimes) int64, procs int) {
print("GC: ", label, " ", get(times), "ns\tmax=", get(&gctimer.max), "\ttotal=", get(&gctimer.total), "\tprocs=", procs, "\n")
}
printPhase("sweep term:", func(t *gctimes) int64 { return t.sweepterm }, gomaxprocs)
printPhase("scan: ", func(t *gctimes) int64 { return t.scan }, 1)
printPhase("install wb:", func(t *gctimes) int64 { return t.installmarkwb }, gomaxprocs)
printPhase("mark: ", func(t *gctimes) int64 { return t.mark }, 1)
printPhase("mark term: ", func(t *gctimes) int64 { return t.markterm }, gomaxprocs)
printunlock()
}
// GC runs a garbage collection.
func GC() {
gogc(2)
}
// linker-provided
var noptrdata struct{}
var enoptrdata struct{}
var noptrbss struct{}
var enoptrbss struct{}
// round n up to a multiple of a. a must be a power of 2.
func round(n, a uintptr) uintptr {
return (n + a - 1) &^ (a - 1)
}
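For instance, with the 8 KB page size used throughout this file:
// round(6000, _PageSize) == (6000 + 8191) &^ 8191 == 8192 (one full page)
// round(8192, _PageSize) == 8192 (already a multiple; a must be a power of 2 for the mask trick)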
var persistent struct {
lock mutex
base unsafe.Pointer
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// See malloc.h for overview.
//
// TODO(rsc): double-check stats.
package runtime
import "unsafe"
const _MaxArena32 = 2 << 30
// For use by Go. If it were a C enum it would be made available automatically,
// but the value of MaxMem is too large for enum.
// XXX - uintptr runtime·maxmem = MaxMem;
func mlookup(v uintptr, base *uintptr, size *uintptr, sp **mspan) int32 {
_g_ := getg()
_g_.m.mcache.local_nlookup++
if ptrSize == 4 && _g_.m.mcache.local_nlookup >= 1<<30 {
// purge cache stats to prevent overflow
lock(&mheap_.lock)
purgecachedstats(_g_.m.mcache)
unlock(&mheap_.lock)
}
s := mHeap_LookupMaybe(&mheap_, unsafe.Pointer(v))
if sp != nil {
*sp = s
}
if s == nil {
if base != nil {
*base = 0
}
if size != nil {
*size = 0
}
return 0
}
p := uintptr(s.start) << _PageShift
if s.sizeclass == 0 {
// Large object.
if base != nil {
*base = p
}
if size != nil {
*size = s.npages << _PageShift
}
return 1
}
n := s.elemsize
if base != nil {
i := (uintptr(v) - uintptr(p)) / n
*base = p + i*n
}
if size != nil {
*size = n
}
return 1
}
//go:nosplit
func purgecachedstats(c *mcache) {
// Protected by either heap or GC lock.
h := &mheap_
memstats.heap_alloc += uint64(c.local_cachealloc)
c.local_cachealloc = 0
if trace.enabled {
traceHeapAlloc()
}
memstats.tinyallocs += uint64(c.local_tinyallocs)
c.local_tinyallocs = 0
memstats.nlookup += uint64(c.local_nlookup)
c.local_nlookup = 0
h.largefree += uint64(c.local_largefree)
c.local_largefree = 0
h.nlargefree += uint64(c.local_nlargefree)
c.local_nlargefree = 0
for i := 0; i < len(c.local_nsmallfree); i++ {
h.nsmallfree[i] += uint64(c.local_nsmallfree[i])
c.local_nsmallfree[i] = 0
}
}
func mallocinit() {
initSizes()
if class_to_size[_TinySizeClass] != _TinySize {
throw("bad TinySizeClass")
}
var p, bitmapSize, spansSize, pSize, limit uintptr
var reserved bool
// limit = runtime.memlimit();
// See https://golang.org/issue/5049
// TODO(rsc): Fix after 1.1.
limit = 0
// Set up the allocation arena, a contiguous area of memory where
// allocated data will be found. The arena begins with a bitmap large
// enough to hold 4 bits per allocated word.
if ptrSize == 8 && (limit == 0 || limit > 1<<30) {
// On a 64-bit machine, allocate from a single contiguous reservation.
// 128 GB (MaxMem) should be big enough for now.
//
// The code will work with the reservation at any address, but ask
// SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f).
// Allocating a 128 GB region takes away 37 bits, and the amd64
// doesn't let us choose the top 17 bits, so that leaves the 11 bits
// in the middle of 0x00c0 for us to choose. Choosing 0x00c0 means
// that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df.
// In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid
// UTF-8 sequences, and they are otherwise as far away from
// ff (likely a common byte) as possible. If that fails, we try other 0xXXc0
// addresses. An earlier attempt to use 0x11f8 caused out of memory errors
// on OS X during thread allocations. 0x00c0 causes conflicts with
// AddressSanitizer which reserves all memory up to 0x0100.
// These choices are both for debuggability and to reduce the
// odds of the conservative garbage collector not collecting memory
// because some non-pointer block of memory had a bit pattern
// that matched a memory address.
//
// Actually we reserve 136 GB (because the bitmap ends up being 8 GB)
// but it hardly matters: e0 00 is not valid UTF-8 either.
//
// If this fails we fall back to the 32 bit memory mechanism
arenaSize := round(_MaxMem, _PageSize)
bitmapSize = arenaSize / (ptrSize * 8 / 4)
spansSize = arenaSize / _PageSize * ptrSize
spansSize = round(spansSize, _PageSize)
for i := 0; i <= 0x7f; i++ {
p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
pSize = bitmapSize + spansSize + arenaSize + _PageSize
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
if p != 0 {
break
}
}
}
if p == 0 {
// On a 32-bit machine, we can't typically get away
// with a giant virtual address space reservation.
// Instead we map the memory information bitmap
// immediately after the data segment, large enough
// to handle another 2GB of mappings (256 MB),
// along with a reservation for an initial arena.
// When that gets used up, we'll start asking the kernel
// for any memory anywhere and hope it's in the 2GB
// following the bitmap (presumably the executable begins
// near the bottom of memory, so we'll have to use up
// most of memory before the kernel resorts to giving out
// memory before the beginning of the text segment).
//
// Alternatively we could reserve 512 MB bitmap, enough
// for 4GB of mappings, and then accept any memory the
// kernel threw at us, but normally that's a waste of 512 MB
// of address space, which is probably too much in a 32-bit world.
// If we fail to allocate, try again with a smaller arena.
// This is necessary on Android L where we share a process
// with ART, which reserves virtual memory aggressively.
arenaSizes := []uintptr{
512 << 20,
256 << 20,
}
for _, arenaSize := range arenaSizes {
bitmapSize = _MaxArena32 / (ptrSize * 8 / 4)
spansSize = _MaxArena32 / _PageSize * ptrSize
if limit > 0 && arenaSize+bitmapSize+spansSize > limit {
bitmapSize = (limit / 9) &^ ((1 << _PageShift) - 1)
arenaSize = bitmapSize * 8
spansSize = arenaSize / _PageSize * ptrSize
}
spansSize = round(spansSize, _PageSize)
// SysReserve treats the address we ask for, end, as a hint,
// not as an absolute requirement. If we ask for the end
// of the data segment but the operating system requires
// a little more space before we can start allocating, it will
// give out a slightly higher pointer. Except QEMU, which
// is buggy, as usual: it won't adjust the pointer upward.
// So adjust it upward a little bit ourselves: 1/4 MB to get
// away from the running binary image and then round up
// to a MB boundary.
p = round(uintptr(unsafe.Pointer(&end))+(1<<18), 1<<20)
pSize = bitmapSize + spansSize + arenaSize + _PageSize
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
if p != 0 {
break
}
}
if p == 0 {
throw("runtime: cannot reserve arena virtual address space")
}
}
// PageSize can be larger than OS definition of page size,
// so SysReserve can give us a PageSize-unaligned pointer.
// To overcome this we ask for PageSize more and round up the pointer.
p1 := round(p, _PageSize)
mheap_.spans = (**mspan)(unsafe.Pointer(p1))
mheap_.bitmap = p1 + spansSize
mheap_.arena_start = p1 + (spansSize + bitmapSize)
mheap_.arena_used = mheap_.arena_start
mheap_.arena_end = p + pSize
mheap_.arena_reserved = reserved
if mheap_.arena_start&(_PageSize-1) != 0 {
println("bad pagesize", hex(p), hex(p1), hex(spansSize), hex(bitmapSize), hex(_PageSize), "start", hex(mheap_.arena_start))
throw("misrounded allocation in mallocinit")
}
// Initialize the rest of the allocator.
mHeap_Init(&mheap_, spansSize)
_g_ := getg()
_g_.m.mcache = allocmcache()
}
// sysReserveHigh reserves space somewhere high in the address space.
// sysReserve doesn't actually reserve the full amount requested on
// 64-bit systems, because of problems with ulimit. Instead it checks
// that it can get the first 64 kB and assumes it can grab the rest as
// needed. This doesn't work well with the "let the kernel pick an address"
// mode, so don't do that. Pick a high address instead.
func sysReserveHigh(n uintptr, reserved *bool) unsafe.Pointer {
if ptrSize == 4 {
return sysReserve(nil, n, reserved)
}
for i := 0; i <= 0x7f; i++ {
p := uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
*reserved = false
p = uintptr(sysReserve(unsafe.Pointer(p), n, reserved))
if p != 0 {
return unsafe.Pointer(p)
}
}
return sysReserve(nil, n, reserved)
}
func mHeap_SysAlloc(h *mheap, n uintptr) unsafe.Pointer {
if n > uintptr(h.arena_end)-uintptr(h.arena_used) {
// We are in 32-bit mode, maybe we didn't use all possible address space yet.
// Reserve some more space.
p_size := round(n+_PageSize, 256<<20)
new_end := h.arena_end + p_size
if new_end <= h.arena_start+_MaxArena32 {
// TODO: It would be bad if part of the arena
// is reserved and part is not.
var reserved bool
p := uintptr(sysReserve((unsafe.Pointer)(h.arena_end), p_size, &reserved))
if p == h.arena_end {
h.arena_end = new_end
h.arena_reserved = reserved
} else if p+p_size <= h.arena_start+_MaxArena32 {
// Keep everything page-aligned.
// Our pages are bigger than hardware pages.
h.arena_end = p + p_size
h.arena_used = p + (-uintptr(p) & (_PageSize - 1))
h.arena_reserved = reserved
} else {
var stat uint64
sysFree((unsafe.Pointer)(p), p_size, &stat)
}
}
}
if n <= uintptr(h.arena_end)-uintptr(h.arena_used) {
// Keep taking from our reservation.
p := h.arena_used
sysMap((unsafe.Pointer)(p), n, h.arena_reserved, &memstats.heap_sys)
h.arena_used += n
mHeap_MapBits(h)
mHeap_MapSpans(h)
if raceenabled {
racemapshadow((unsafe.Pointer)(p), n)
}
if mheap_.shadow_enabled {
sysMap(unsafe.Pointer(p+mheap_.shadow_heap), n, h.shadow_reserved, &memstats.other_sys)
}
if uintptr(p)&(_PageSize-1) != 0 {
throw("misrounded allocation in MHeap_SysAlloc")
}
return (unsafe.Pointer)(p)
}
// If using 64-bit, our reservation is all we have.
if uintptr(h.arena_end)-uintptr(h.arena_start) >= _MaxArena32 {
return nil
}
// On 32-bit, once the reservation is gone we can
// try to get memory at a location chosen by the OS
// and hope that it is in the range we allocated bitmap for.
p_size := round(n, _PageSize) + _PageSize
p := uintptr(sysAlloc(p_size, &memstats.heap_sys))
if p == 0 {
return nil
}
if p < h.arena_start || uintptr(p)+p_size-uintptr(h.arena_start) >= _MaxArena32 {
print("runtime: memory allocated by OS (", p, ") not in usable range [", hex(h.arena_start), ",", hex(h.arena_start+_MaxArena32), ")\n")
sysFree((unsafe.Pointer)(p), p_size, &memstats.heap_sys)
return nil
}
p_end := p + p_size
p += -p & (_PageSize - 1)
if uintptr(p)+n > uintptr(h.arena_used) {
h.arena_used = p + n
if p_end > h.arena_end {
h.arena_end = p_end
}
mHeap_MapBits(h)
mHeap_MapSpans(h)
if raceenabled {
racemapshadow((unsafe.Pointer)(p), n)
}
}
if uintptr(p)&(_PageSize-1) != 0 {
throw("misrounded allocation in MHeap_SysAlloc")
}
return (unsafe.Pointer)(p)
}
var end struct{}
func largeAlloc(size uintptr, flag uint32) *mspan {
// print("largeAlloc size=", size, "\n")
if size+_PageSize < size {
throw("out of memory")
}
npages := size >> _PageShift
if size&_PageMask != 0 {
npages++
}
s := mHeap_Alloc(&mheap_, npages, 0, true, flag&_FlagNoZero == 0)
if s == nil {
throw("out of memory")
}
s.limit = uintptr(s.start)<<_PageShift + size
heapBitsForSpan(s.base()).initSpan(s.layout())
return s
}
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime
import "unsafe"
// Memory allocator, based on tcmalloc.
// http://goog-perftools.sourceforge.net/doc/tcmalloc.html
// The main allocator works in runs of pages.
// Small allocation sizes (up to and including 32 kB) are
// rounded to one of about 100 size classes, each of which
// has its own free list of objects of exactly that size.
// Any free page of memory can be split into a set of objects
// of one size class, which are then managed using free list
// allocators.
//
// The allocator's data structures are:
//
// FixAlloc: a free-list allocator for fixed-size objects,
// used to manage storage used by the allocator.
// MHeap: the malloc heap, managed at page (4096-byte) granularity.
// MSpan: a run of pages managed by the MHeap.
// MCentral: a shared free list for a given size class.
// MCache: a per-thread (in Go, per-P) cache for small objects.
// MStats: allocation statistics.
//
// Allocating a small object proceeds up a hierarchy of caches:
//
// 1. Round the size up to one of the small size classes
// and look in the corresponding MCache free list.
// If the list is not empty, allocate an object from it.
// This can all be done without acquiring a lock.
//
// 2. If the MCache free list is empty, replenish it by
// taking a bunch of objects from the MCentral free list.
// Moving a bunch amortizes the cost of acquiring the MCentral lock.
//
// 3. If the MCentral free list is empty, replenish it by
// allocating a run of pages from the MHeap and then
// chopping that memory into objects of the given size.
// Allocating many objects amortizes the cost of locking
// the heap.
//
// 4. If the MHeap is empty or has no page runs large enough,
// allocate a new group of pages (at least 1MB) from the
// operating system. Allocating a large run of pages
// amortizes the cost of talking to the operating system.
//
// Freeing a small object proceeds up the same hierarchy:
//
// 1. Look up the size class for the object and add it to
// the MCache free list.
//
// 2. If the MCache free list is too long or the MCache has
// too much memory, return some to the MCentral free lists.
//
// 3. If all the objects in a given span have returned to
// the MCentral list, return that span to the page heap.
//
// 4. If the heap has too much memory, return some to the
// operating system.
//
// TODO(rsc): Step 4 is not implemented.
//
// Allocating and freeing a large object uses the page heap
// directly, bypassing the MCache and MCentral free lists.
//
// The small objects on the MCache and MCentral free lists
// may or may not be zeroed. They are zeroed if and only if
// the second word of the object is zero. A span in the
// page heap is zeroed unless s->needzero is set. When a span
// is allocated to break into small objects, it is zeroed if needed
// and s->needzero is set. There are two main benefits to delaying the
// zeroing this way:
//
// 1. stack frames allocated from the small object lists
// or the page heap can avoid zeroing altogether.
// 2. the cost of zeroing when reusing a small object is
// charged to the mutator, not the garbage collector.
//
// This C code was written with an eye toward translating to Go
// in the future. Methods have the form Type_Method(Type *t, ...).
const (
_PageShift = 13
_PageSize = 1 << _PageShift
_PageMask = _PageSize - 1
)
const (
// _64bit = 1 on 64-bit systems, 0 on 32-bit systems
_64bit = 1 << (^uintptr(0) >> 63) / 2
// Computed constant. The definition of MaxSmallSize and the
// algorithm in msize.c produce some number of different allocation
// size classes. NumSizeClasses is that number. It's needed here
// because there are static arrays of this length; when msize runs its
// size choosing algorithm it double-checks that NumSizeClasses agrees.
_NumSizeClasses = 67
// Tunable constants.
_MaxSmallSize = 32 << 10
// Tiny allocator parameters, see "Tiny allocator" comment in malloc.go.
_TinySize = 16
_TinySizeClass = 2
_FixAllocChunk = 16 << 10 // Chunk size for FixAlloc
_MaxMHeapList = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap.
_HeapAllocChunk = 1 << 20 // Chunk size for heap growth
// Per-P, per order stack segment cache size.
_StackCacheSize = 32 * 1024
// Number of orders that get caching. Order 0 is FixedStack
// and each successive order is twice as large.
// We want to cache 2KB, 4KB, 8KB, and 16KB stacks. Larger stacks
// will be allocated directly.
// Since FixedStack is different on different systems, we
// must vary NumStackOrders to keep the same maximum cached size.
// OS | FixedStack | NumStackOrders
// -----------------+------------+---------------
// linux/darwin/bsd | 2KB | 4
// windows/32 | 4KB | 3
// windows/64 | 8KB | 2
// plan9 | 4KB | 3
_NumStackOrders = 4 - ptrSize/4*goos_windows - 1*goos_plan9
// Number of bits in page to span calculations (4k pages).
// On Windows 64-bit we limit the arena to 32GB or 35 bits.
// Windows counts memory used by page table into committed memory
// of the process, so we can't reserve too much memory.
// See http://golang.org/issue/5402 and http://golang.org/issue/5236.
// On other 64-bit platforms, we limit the arena to 128GB, or 37 bits.
// On 32-bit, we don't bother limiting anything, so we use the full 32-bit address.
_MHeapMap_TotalBits = (_64bit*goos_windows)*35 + (_64bit*(1-goos_windows))*37 + (1-_64bit)*32
_MHeapMap_Bits = _MHeapMap_TotalBits - _PageShift
_MaxMem = uintptr(1<<_MHeapMap_TotalBits - 1)
// Max number of threads to run garbage collection.
// 2, 3, and 4 are all plausible maximums depending
// on the hardware details of the machine. The garbage
// collector scales well to 32 cpus.
_MaxGcproc = 32
)
// A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).)
// Since assignments to mlink.next will result in a write barrier being performed,
// this cannot be used by some of the internal GC structures. For example when
// the sweeper is placing an unmarked object on the free list it does not want the
// write barrier to be called since that could result in the object being reachable.
type mlink struct {
next *mlink
}
// A gclink is a node in a linked list of blocks, like mlink,
// but it is opaque to the garbage collector.
// The GC does not trace the pointers during collection,
// and the compiler does not emit write barriers for assignments
// of gclinkptr values. Code should store references to gclinks
// as gclinkptr, not as *gclink.
type gclink struct {
next gclinkptr
}
// A gclinkptr is a pointer to a gclink, but it is opaque
// to the garbage collector.
type gclinkptr uintptr
// ptr returns the *gclink form of p.
// The result should be used for accessing fields, not stored
// in other data structures.
func (p gclinkptr) ptr() *gclink {
return (*gclink)(unsafe.Pointer(p))
}
// sysAlloc obtains a large chunk of zeroed memory from the
// operating system, typically on the order of a hundred kilobytes
// or a megabyte.
// NOTE: sysAlloc returns OS-aligned memory, but the heap allocator
// may use larger alignment, so the caller must be careful to realign the
// memory obtained by sysAlloc.
//
// SysUnused notifies the operating system that the contents
// of the memory region are no longer needed and can be reused
// for other purposes.
// SysUsed notifies the operating system that the contents
// of the memory region are needed again.
//
// SysFree returns it unconditionally; this is only used if
// an out-of-memory error has been detected midway through
// an allocation. It is okay if SysFree is a no-op.
//
// SysReserve reserves address space without allocating memory.
// If the pointer passed to it is non-nil, the caller wants the
// reservation there, but SysReserve can still choose another
// location if that one is unavailable. On some systems and in some
// cases SysReserve will simply check that the address space is
// available and not actually reserve it. If SysReserve returns
// non-nil, it sets *reserved to true if the address space is
// reserved, false if it has merely been checked.
// NOTE: SysReserve returns OS-aligned memory, but the heap allocator
// may use larger alignment, so the caller must be careful to realign the
// memory obtained by sysAlloc.
//
// SysMap maps previously reserved address space for use.
// The reserved argument is true if the address space was really
// reserved, not merely checked.
//
// SysFault marks an (already sysAlloc'd) region to fault
// if accessed. Used only for debugging the runtime.
// FixAlloc is a simple free-list allocator for fixed size objects.
// Malloc uses a FixAlloc wrapped around sysAlloc to manage its
// MCache and MSpan objects.
//
// Memory returned by FixAlloc_Alloc is not zeroed.
// The caller is responsible for locking around FixAlloc calls.
// Callers can keep state in the object but the first word is
// smashed by freeing and reallocating.
type fixalloc struct {
size uintptr
first unsafe.Pointer // go func(unsafe.pointer, unsafe.pointer); f(arg, p) called first time p is returned
arg unsafe.Pointer
list *mlink
chunk *byte
nchunk uint32
inuse uintptr // in-use bytes now
stat *uint64
}
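A self-contained toy version of the same idea (hypothetical names, not the runtime's mfixalloc code) shows why the first word of an object is "smashed by freeing and reallocating": a freed block's first word is reused as the free-list link.
package toyfix
import "unsafe"
const blockSize = 64        // hypothetical fixed object size
const chunkSize = 16 * 1024 // mirrors _FixAllocChunk
type fixAlloc struct {
	free  unsafe.Pointer // head of the free list; each free block's first word holds the next pointer
	chunk []byte         // current chunk being carved into blocks
	off   int
}
func (f *fixAlloc) alloc() unsafe.Pointer {
	if f.free != nil {
		p := f.free
		f.free = *(*unsafe.Pointer)(p) // pop: follow the link stored in the block itself
		return p
	}
	if f.off+blockSize > len(f.chunk) {
		f.chunk = make([]byte, chunkSize) // the runtime uses persistentalloc here
		f.off = 0
	}
	p := unsafe.Pointer(&f.chunk[f.off])
	f.off += blockSize
	return p
}
func (f *fixAlloc) freeBlock(p unsafe.Pointer) {
	*(*unsafe.Pointer)(p) = f.free // smash the first word with the old list head
	f.free = p
}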
// Statistics.
// Shared with Go: if you edit this structure, also edit type MemStats in mem.go.
type mstats struct {
// General statistics.
alloc uint64 // bytes allocated and still in use
total_alloc uint64 // bytes allocated (even if freed)
sys uint64 // bytes obtained from system (should be sum of xxx_sys below, no locking, approximate)
nlookup uint64 // number of pointer lookups
nmalloc uint64 // number of mallocs
nfree uint64 // number of frees
// Statistics about malloc heap.
// protected by mheap.lock
heap_alloc uint64 // bytes allocated and still in use
heap_sys uint64 // bytes obtained from system
heap_idle uint64 // bytes in idle spans
heap_inuse uint64 // bytes in non-idle spans
heap_released uint64 // bytes released to the os
heap_objects uint64 // total number of allocated objects
// Statistics about allocation of low-level fixed-size structures.
// Protected by FixAlloc locks.
stacks_inuse uint64 // this number is included in heap_inuse above
stacks_sys uint64 // always 0 in mstats
mspan_inuse uint64 // mspan structures
mspan_sys uint64
mcache_inuse uint64 // mcache structures
mcache_sys uint64
buckhash_sys uint64 // profiling bucket hash table
gc_sys uint64
other_sys uint64
// Statistics about garbage collector.
// Protected by mheap or stopping the world during GC.
next_gc uint64 // next gc (in heap_alloc time)
last_gc uint64 // last gc (in absolute time)
pause_total_ns uint64
pause_ns [256]uint64 // circular buffer of recent gc pause lengths
pause_end [256]uint64 // circular buffer of recent gc end times (nanoseconds since 1970)
numgc uint32
enablegc bool
debuggc bool
// Statistics about allocation size classes.
by_size [_NumSizeClasses]struct {
size uint32
nmalloc uint64
nfree uint64
}
tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
}
var memstats mstats
// Size classes. Computed and initialized by InitSizes.
//
// SizeToClass(0 <= n <= MaxSmallSize) returns the size class,
// 1 <= sizeclass < NumSizeClasses, for n.
// Size class 0 is reserved to mean "not small".
//
// class_to_size[i] = largest size in class i
// class_to_allocnpages[i] = number of pages to allocate when
// making new objects in class i
var class_to_size [_NumSizeClasses]int32
var class_to_allocnpages [_NumSizeClasses]int32
var size_to_class8 [1024/8 + 1]int8
var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8
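A sketch of the lookup these tables support, written as it would appear inside this package (the real version is built by initSizes in msize.go); it assumes the 8-byte and 128-byte steps described in the comment above.
func sizeToClass(size int32) int32 {
	if size > _MaxSmallSize {
		throw("sizeToClass: invalid size")
	}
	if size > 1024-8 {
		return int32(size_to_class128[(size-1024+127)>>7]) // 128-byte steps above ~1 KB
	}
	return int32(size_to_class8[(size+7)>>3]) // 8-byte steps below that
}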
type mcachelist struct {
list *mlink
nlist uint32
}
type stackfreelist struct {
list gclinkptr // linked list of free stacks
size uintptr // total size of stacks in list
}
// Per-thread (in Go, per-P) cache for small objects.
// No locking needed because it is per-thread (per-P).
type mcache struct {
// The following members are accessed on every malloc,
// so they are grouped here for better caching.
next_sample int32 // trigger heap sample after allocating this many bytes
local_cachealloc intptr // bytes allocated (or freed) from cache since last lock of heap
// Allocator cache for tiny objects w/o pointers.
// See "Tiny allocator" comment in malloc.go.
tiny unsafe.Pointer
tinyoffset uintptr
local_tinyallocs uintptr // number of tiny allocs not counted in other stats
// The rest is not accessed on every malloc.
alloc [_NumSizeClasses]*mspan // spans to allocate from
stackcache [_NumStackOrders]stackfreelist
sudogcache *sudog
// Local allocator stats, flushed during GC.
local_nlookup uintptr // number of pointer lookups
local_largefree uintptr // bytes freed for large objects (>maxsmallsize)
local_nlargefree uintptr // number of frees for large objects (>maxsmallsize)
local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
}
const (
_KindSpecialFinalizer = 1
_KindSpecialProfile = 2
// Note: The finalizer special must be first because if we're freeing
// an object, a finalizer special will cause the freeing operation
// to abort, and we want to keep the other special records around
// if that happens.
)
type special struct {
next *special // linked list in span
offset uint16 // span offset of object
kind byte // kind of special
}
// The described object has a finalizer set for it.
type specialfinalizer struct {
special special
fn *funcval
nret uintptr
fint *_type
ot *ptrtype
}
// The described object is being heap profiled.
type specialprofile struct {
special special
b *bucket
}
// An MSpan is a run of pages.
const (
_MSpanInUse = iota // allocated for garbage collected heap
_MSpanStack // allocated for use by stack allocator
_MSpanFree
_MSpanListHead
_MSpanDead
)
type mspan struct {
next *mspan // in a span linked list
prev *mspan // in a span linked list
start pageID // starting page number
npages uintptr // number of pages in span
freelist gclinkptr // list of free objects
// sweep generation:
// if sweepgen == h->sweepgen - 2, the span needs sweeping
// if sweepgen == h->sweepgen - 1, the span is currently being swept
// if sweepgen == h->sweepgen, the span is swept and ready to use
// h->sweepgen is incremented by 2 after every GC
sweepgen uint32
ref uint16 // capacity - number of objects in freelist
sizeclass uint8 // size class
incache bool // being used by an mcache
state uint8 // mspaninuse etc
needzero uint8 // needs to be zeroed before allocation
elemsize uintptr // computed from sizeclass or from npages
unusedsince int64 // first time spotted by gc in mspanfree state
npreleased uintptr // number of pages released to the os
limit uintptr // end of data in span
speciallock mutex // guards specials list
specials *special // linked list of special records sorted by offset.
}
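A minimal sketch (an editor's illustration, assuming h is the mheap) of how the sweepgen encoding in the comment above is read:
// spanNeedsSweep reports whether s has not yet been swept for the current GC
// cycle; sg-1 would mean "being swept now" and sg means "swept and ready".
func spanNeedsSweep(s *mspan, h *mheap) bool {
	sg := h.sweepgen // the real code loads this atomically during concurrent sweep
	return s.sweepgen == sg-2
}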
func (s *mspan) base() uintptr {
return uintptr(s.start << _PageShift)
}
func (s *mspan) layout() (size, n, total uintptr) {
total = s.npages << _PageShift
size = s.elemsize
if size > 0 {
n = total / size
}
return
}
// Every MSpan is in one doubly-linked list,
// either one of the MHeap's free lists or one of the
// MCentral's span lists. We use empty MSpan structures as list heads.
// Central list of free objects of a given size.
type mcentral struct {
lock mutex
sizeclass int32
nonempty mspan // list of spans with a free object
empty mspan // list of spans with no free objects (or cached in an mcache)
}
// Main malloc heap.
// The heap itself is the "free[]" and "large" arrays,
// but all the other global data is here too.
type mheap struct {
lock mutex
free [_MaxMHeapList]mspan // free lists of given length
freelarge mspan // free lists length >= _MaxMHeapList
busy [_MaxMHeapList]mspan // busy lists of large objects of given length
busylarge mspan // busy lists of large objects length >= _MaxMHeapList
allspans **mspan // all spans out there
gcspans **mspan // copy of allspans referenced by gc marker or sweeper
nspan uint32
sweepgen uint32 // sweep generation, see comment in mspan
sweepdone uint32 // all spans are swept
// span lookup
spans **mspan
spans_mapped uintptr
// range of addresses we might see in the heap
bitmap uintptr
bitmap_mapped uintptr
arena_start uintptr
arena_used uintptr
arena_end uintptr
arena_reserved bool
// write barrier shadow data+heap.
// 64-bit systems only, enabled by GODEBUG=wbshadow=1.
shadow_enabled bool // shadow should be updated and checked
shadow_reserved bool // shadow memory is reserved
shadow_heap uintptr // heap-addr + shadow_heap = shadow heap addr
shadow_data uintptr // data-addr + shadow_data = shadow data addr
data_start uintptr // start of shadowed data addresses
data_end uintptr // end of shadowed data addresses
// central free lists for small size classes.
// the padding makes sure that the MCentrals are
// spaced CacheLineSize bytes apart, so that each MCentral.lock
// gets its own cache line.
central [_NumSizeClasses]struct {
mcentral mcentral
pad [_CacheLineSize]byte
}
spanalloc fixalloc // allocator for span*
cachealloc fixalloc // allocator for mcache*
specialfinalizeralloc fixalloc // allocator for specialfinalizer*
specialprofilealloc fixalloc // allocator for specialprofile*
speciallock mutex // lock for special record allocators.
// Malloc stats.
largefree uint64 // bytes freed for large objects (>maxsmallsize)
nlargefree uint64 // number of frees for large objects (>maxsmallsize)
nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
}
var mheap_ mheap
const (
// flags to malloc
_FlagNoScan = 1 << 0 // GC doesn't have to scan object
_FlagNoZero = 1 << 1 // don't zero memory
)
// NOTE: Layout known to queuefinalizer.
type finalizer struct {
fn *funcval // function to call
arg unsafe.Pointer // ptr to object
nret uintptr // bytes of return values from fn
fint *_type // type of first argument of fn
ot *ptrtype // type of ptr to object
}
type finblock struct {
alllink *finblock
next *finblock
cnt int32
_ int32
fin [(_FinBlockSize - 2*ptrSize - 2*4) / unsafe.Sizeof(finalizer{})]finalizer
}
// Information from the compiler about the layout of stack frames.
type bitvector struct {
n int32 // # of bits
bytedata *uint8
}
type stackmap struct {
n int32 // number of bitmaps
nbit int32 // number of bits in each bitmap
bytedata [1]byte // bitmaps, each starting on a 32-bit boundary
}
@@ -82,6 +82,12 @@ const (
typeShift = 2
)
// Information from the compiler about the layout of stack frames.
type bitvector struct {
n int32 // # of bits
bytedata *uint8
}
// addb returns the byte pointer p+n.
//go:nowritebarrier
func addb(p *byte, n uintptr) *byte {
@@ -2,14 +2,63 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Per-P malloc cache for small objects.
//
// See malloc.h for an overview.
package runtime
import "unsafe"
// Per-thread (in Go, per-P) cache for small objects.
// No locking needed because it is per-thread (per-P).
type mcache struct {
// The following members are accessed on every malloc,
// so they are grouped here for better caching.
next_sample int32 // trigger heap sample after allocating this many bytes
local_cachealloc intptr // bytes allocated (or freed) from cache since last lock of heap
// Allocator cache for tiny objects w/o pointers.
// See "Tiny allocator" comment in malloc.go.
tiny unsafe.Pointer
tinyoffset uintptr
local_tinyallocs uintptr // number of tiny allocs not counted in other stats
// The rest is not accessed on every malloc.
alloc [_NumSizeClasses]*mspan // spans to allocate from
stackcache [_NumStackOrders]stackfreelist
sudogcache *sudog
// Local allocator stats, flushed during GC.
local_nlookup uintptr // number of pointer lookups
local_largefree uintptr // bytes freed for large objects (>maxsmallsize)
local_nlargefree uintptr // number of frees for large objects (>maxsmallsize)
local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
}
// A gclink is a node in a linked list of blocks, like mlink,
// but it is opaque to the garbage collector.
// The GC does not trace the pointers during collection,
// and the compiler does not emit write barriers for assignments
// of gclinkptr values. Code should store references to gclinks
// as gclinkptr, not as *gclink.
type gclink struct {
next gclinkptr
}
// A gclinkptr is a pointer to a gclink, but it is opaque
// to the garbage collector.
type gclinkptr uintptr
// ptr returns the *gclink form of p.
// The result should be used for accessing fields, not stored
// in other data structures.
func (p gclinkptr) ptr() *gclink {
return (*gclink)(unsafe.Pointer(p))
}
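For illustration, a sketch (an assumption on my part, not code added by this commit) of the usual head/end pattern for building a gclinkptr-threaded free list, as mSpan_Sweep does further down; because the links are plain uintptrs, neither the GC nor the write barrier observes the stores.

// Sketch only: append the object at address p to a gclinkptr-threaded list.
var head, end gclinkptr
push := func(p uintptr) {
	if head.ptr() == nil {
		head = gclinkptr(p)
	} else {
		end.ptr().next = gclinkptr(p)
	}
	end = gclinkptr(p)
}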
type stackfreelist struct {
list gclinkptr // linked list of free stacks
size uintptr // total size of stacks in list
}
// dummy MSpan that contains no free objects.
var emptymspan mspan
......
......@@ -12,6 +12,14 @@
package runtime
// Central list of free objects of a given size.
type mcentral struct {
lock mutex
sizeclass int32
nonempty mspan // list of spans with a free object
empty mspan // list of spans with no free objects (or cached in an mcache)
}
// Initialize a single central free list.
func mCentral_Init(c *mcentral, sizeclass int32) {
c.sizeclass = sizeclass
......
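For illustration, a hypothetical wiring sketch (not necessarily this commit's exact code): the heap owns one mcentral per size class in its padded central array, and each entry is initialized with its class index via mCentral_Init declared above.

// Sketch of per-size-class initialization over mheap_.central.
for i := range mheap_.central {
	mCentral_Init(&mheap_.central[i].mcentral, int32(i))
}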
......@@ -8,6 +8,14 @@ package runtime
import "unsafe"
type finblock struct {
alllink *finblock
next *finblock
cnt int32
_ int32
fin [(_FinBlockSize - 2*ptrSize - 2*4) / unsafe.Sizeof(finalizer{})]finalizer
}
var finlock mutex // protects the following variables
var fing *g // goroutine that runs finalizers
var finq *finblock // list of finalizers that are to be executed
......@@ -17,6 +25,15 @@ var fingwait bool
var fingwake bool
var allfin *finblock // list of all blocks
// NOTE: Layout known to queuefinalizer.
type finalizer struct {
fn *funcval // function to call
arg unsafe.Pointer // ptr to object
nret uintptr // bytes of return values from fn
fint *_type // type of first argument of fn
ot *ptrtype // type of ptr to object
}
var finalizer1 = [...]byte{
// Each Finalizer is 5 words, ptr ptr uintptr ptr ptr.
// Each byte describes 4 words.
......
......@@ -10,6 +10,34 @@ package runtime
import "unsafe"
// FixAlloc is a simple free-list allocator for fixed size objects.
// Malloc uses a FixAlloc wrapped around sysAlloc to manage its
// MCache and MSpan objects.
//
// Memory returned by FixAlloc_Alloc is not zeroed.
// The caller is responsible for locking around FixAlloc calls.
// Callers can keep state in the object but the first word is
// smashed by freeing and reallocating.
type fixalloc struct {
size uintptr
first unsafe.Pointer // go func(unsafe.pointer, unsafe.pointer); f(arg, p) called first time p is returned
arg unsafe.Pointer
list *mlink
chunk *byte
nchunk uint32
inuse uintptr // in-use bytes now
stat *uint64
}
// A generic linked list of blocks. (Typically the block is bigger than sizeof(MLink).)
// Since assignments to mlink.next will result in a write barrier being performed,
// this cannot be used by some of the internal GC structures. For example, when
// the sweeper is placing an unmarked object on the free list it does not want the
// write barrier to be called since that could result in the object being reachable.
type mlink struct {
next *mlink
}
// Initialize f to allocate objects of the given size,
// using the allocator to obtain chunks of memory.
func fixAlloc_Init(f *fixalloc, size uintptr, first func(unsafe.Pointer, unsafe.Pointer), arg unsafe.Pointer, stat *uint64) {
......
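To make the "first word is smashed" behavior concrete, here is a small standalone mirror of the fixalloc idea (my own sketch in ordinary Go, not runtime code): freed blocks are threaded through their own first word, and a fresh chunk is chopped into blocks whenever the list runs dry.

package main

import (
	"fmt"
	"unsafe"
)

const blockSize = 64 // fixed object size managed by this toy allocator

type block [blockSize]byte

type freeList struct {
	head   *block
	chunks [][]block // keep chunks reachable; the GC cannot see the links below
}

// free threads b onto the list by smashing its first word with the old head,
// which is why callers may not rely on the first word surviving a free.
func (f *freeList) free(b *block) {
	*(**block)(unsafe.Pointer(b)) = f.head
	f.head = b
}

// alloc pops a block, chopping a new chunk into blocks when the list is empty
// (the real allocator obtains its chunks from sysAlloc rather than make).
func (f *freeList) alloc() *block {
	if f.head == nil {
		chunk := make([]block, 16)
		f.chunks = append(f.chunks, chunk)
		for i := range chunk {
			f.free(&chunk[i])
		}
	}
	b := f.head
	f.head = *(**block)(unsafe.Pointer(b))
	return b
}

func main() {
	var f freeList
	a, b := f.alloc(), f.alloc()
	f.free(a)
	f.free(b)
	fmt.Println("last freed is reused first:", f.alloc() == b) // true
}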
......@@ -131,54 +131,8 @@ const (
_RootCount = 5
)
// ptrmask for an allocation containing a single pointer.
var oneptr = [...]uint8{typePointer}
// Initialized from $GOGC. GOGC=off means no GC.
var gcpercent int32
// Holding worldsema grants an M the right to try to stop the world.
// The procedure is:
//
// semacquire(&worldsema);
// m.preemptoff = "reason";
// stoptheworld();
//
// ... do stuff ...
//
// m.preemptoff = "";
// semrelease(&worldsema);
// starttheworld();
//
var worldsema uint32 = 1
var data, edata, bss, ebss, gcdata, gcbss struct{}
var gcdatamask bitvector
var gcbssmask bitvector
var gclock mutex
var badblock [1024]uintptr
var nbadblock int32
type workdata struct {
full uint64 // lock-free list of full blocks workbuf
empty uint64 // lock-free list of empty blocks workbuf
partial uint64 // lock-free list of partially filled blocks workbuf
pad0 [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
nproc uint32
tstart int64
nwait uint32
ndone uint32
alldone note
markfor *parfor
// Copy of mheap.allspans for marker or sweeper.
spans []*mspan
}
var work workdata
// linker-provided
var data, edata, bss, ebss, gcdata, gcbss, noptrdata, enoptrdata, noptrbss, enoptrbss, end struct{}
//go:linkname weak_cgo_allocate go.weak.runtime._cgo_allocate_internal
var weak_cgo_allocate byte
......@@ -189,45 +143,6 @@ func have_cgo_allocate() bool {
return &weak_cgo_allocate != nil
}
// To help debug the concurrent GC we remark with the world
// stopped ensuring that any object encountered has their normal
// mark bit set. To do this we use an orthogonal bit
// pattern to indicate the object is marked. The following pattern
// uses the upper two bits in the object's boundary nibble.
// 01: scalar not marked
// 10: pointer not marked
// 11: pointer marked
// 00: scalar marked
// Xoring with 01 will flip the pattern from marked to unmarked and vice versa.
// The higher bit is 1 for pointers and 0 for scalars, whether the object
// is marked or not.
// The first nibble no longer holds the typeDead pattern indicating that
// there are no more pointers in the object. This information is held
// in the second nibble.
// When marking an object, if the bool checkmarkphase is true, one uses the above
// encoding; otherwise one uses the bitMarked bit in the lower two bits
// of the nibble.
var checkmarkphase = false
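A standalone illustration (not runtime code) of the checkmark encoding above: XORing the two bits with 01 toggles the marked state, while the high bit keeps distinguishing pointers from scalars.

package main

import "fmt"

func main() {
	names := map[byte]string{
		1: "scalar, not marked",  // 01
		2: "pointer, not marked", // 10
		3: "pointer, marked",     // 11
		0: "scalar, marked",      // 00
	}
	for _, bits := range []byte{1, 2, 3, 0} {
		flipped := bits ^ 1 // xoring with 01 flips marked <-> unmarked
		fmt.Printf("%02b (%s) -> %02b (%s)\n", bits, names[bits], flipped, names[flipped])
	}
}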
// inheap reports whether b is a pointer into a (potentially dead) heap object.
// It returns false for pointers into stack spans.
//go:nowritebarrier
func inheap(b uintptr) bool {
if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used {
return false
}
// Not a beginning of a block, consult span table to find the block beginning.
k := b >> _PageShift
x := k
x -= mheap_.arena_start >> _PageShift
s := h_spans[x]
if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse {
return false
}
return true
}
// Slow for now as we serialize this, since this is on a debug path
// speed is not critical at this point.
var andlock mutex
......@@ -239,786 +154,115 @@ func atomicand8(src *byte, val byte) {
unlock(&andlock)
}
// When in GCmarkterminate phase we allocate black.
//go:nowritebarrier
func gcmarknewobject_m(obj uintptr) {
if gcphase != _GCmarktermination {
throw("marking new object while not in mark termination phase")
}
if checkmarkphase { // The world should be stopped so this should not happen.
throw("gcmarknewobject called while doing checkmark")
}
heapBitsForAddr(obj).setMarked()
}
// obj is the start of an object with mark mbits.
// If it isn't already marked, mark it and enqueue into workbuf.
// Return possibly new workbuf to use.
// base and off are for debugging only and could be removed.
//go:nowritebarrier
func greyobject(obj, base, off uintptr, hbits heapBits, gcw *gcWorkProducer) {
// obj should be start of allocation, and so must be at least pointer-aligned.
if obj&(ptrSize-1) != 0 {
throw("greyobject: obj not pointer-aligned")
}
if checkmarkphase {
if !hbits.isMarked() {
print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), "\n")
print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
// Dump the source (base) object
kb := base >> _PageShift
xb := kb
xb -= mheap_.arena_start >> _PageShift
sb := h_spans[xb]
printlock()
print("runtime:greyobject Span: base=", hex(base), " kb=", hex(kb))
if sb == nil {
print(" sb=nil\n")
} else {
print(" sb.start*_PageSize=", hex(sb.start*_PageSize), " sb.limit=", hex(sb.limit), " sb.sizeclass=", sb.sizeclass, " sb.elemsize=", sb.elemsize, "\n")
// base is (a pointer to) the source object holding the reference to object.
// Create a pointer to each of the fields in base and print them out as hex values.
for i := 0; i < int(sb.elemsize/ptrSize); i++ {
print(" *(base+", i*ptrSize, ") = ", hex(*(*uintptr)(unsafe.Pointer(base + uintptr(i)*ptrSize))), "\n")
}
}
// Dump the object
k := obj >> _PageShift
x := k
x -= mheap_.arena_start >> _PageShift
s := h_spans[x]
print("runtime:greyobject Span: obj=", hex(obj), " k=", hex(k))
if s == nil {
print(" s=nil\n")
} else {
print(" s.start=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n")
// NOTE(rsc): This code is using s.sizeclass as an approximation of the
// number of pointer-sized words in an object. Perhaps not what was intended.
for i := 0; i < int(s.sizeclass); i++ {
print(" *(obj+", i*ptrSize, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + uintptr(i)*ptrSize))), "\n")
}
}
throw("checkmark found unmarked object")
}
if !hbits.isCheckmarked() {
return
}
hbits.setCheckmarked()
if !hbits.isCheckmarked() {
throw("setCheckmarked and isCheckmarked disagree")
}
} else {
// If marked we have nothing to do.
if hbits.isMarked() {
return
}
// Each byte of GC bitmap holds info for two words.
// Might be racing with other updates, so use atomic update always.
// We used to be clever here and use a non-atomic update in certain
// cases, but it's not worth the risk.
hbits.setMarked()
}
if !checkmarkphase && hbits.typeBits() == typeDead {
return // noscan object
}
// Queue the obj for scanning. The PREFETCH(obj) logic has been removed but
// seems like a nice optimization that can be added back in.
// There needs to be time between the PREFETCH and the use.
// Previously we put the obj in an 8 element buffer that is drained at a rate
// to give the PREFETCH time to do its work.
// Use of PREFETCHNTA might be more appropriate than PREFETCH
gcw.put(obj)
}
// Scan the object b of size n, adding pointers to wbuf.
// Return possibly new wbuf to use.
// If ptrmask != nil, it specifies where pointers are in b.
// If ptrmask == nil, the GC bitmap should be consulted.
// In this case, n may be an overestimate of the size; the GC bitmap
// must also be used to make sure the scan stops at the end of b.
//go:nowritebarrier
func scanobject(b, n uintptr, ptrmask *uint8, gcw *gcWorkProducer) {
arena_start := mheap_.arena_start
arena_used := mheap_.arena_used
// Find bits of the beginning of the object.
var hbits heapBits
if ptrmask == nil {
b, hbits = heapBitsForObject(b)
if b == 0 {
return
}
if n == 0 {
n = mheap_.arena_used - b
}
}
for i := uintptr(0); i < n; i += ptrSize {
// Find bits for this word.
var bits uintptr
if ptrmask != nil {
// dense mask (stack or data)
bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * typeBitsWidth)) & typeMask
} else {
// Check if we have reached end of span.
// n is an overestimate of the size of the object.
if (b+i)%_PageSize == 0 && h_spans[(b-arena_start)>>_PageShift] != h_spans[(b+i-arena_start)>>_PageShift] {
break
}
bits = uintptr(hbits.typeBits())
if i > 0 && (hbits.isBoundary() || bits == typeDead) {
break // reached beginning of the next object
}
hbits = hbits.next()
}
if bits <= typeScalar { // typeScalar, typeDead, typeScalarMarked
continue
}
if bits&typePointer != typePointer {
print("gc checkmarkphase=", checkmarkphase, " b=", hex(b), " ptrmask=", ptrmask, "\n")
throw("unexpected garbage collection bits")
}
obj := *(*uintptr)(unsafe.Pointer(b + i))
// At this point we have extracted the next potential pointer.
// Check if it points into heap.
if obj == 0 || obj < arena_start || obj >= arena_used {
continue
}
if mheap_.shadow_enabled && debug.wbshadow >= 2 && debug.gccheckmark > 0 && checkmarkphase {
checkwbshadow((*uintptr)(unsafe.Pointer(b + i)))
}
// Mark the object.
if obj, hbits := heapBitsForObject(obj); obj != 0 {
greyobject(obj, b, i, hbits, gcw)
}
}
}
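To make the dense-mask lookup in scanobject concrete, here is a standalone sketch (the 2-bit width and 3 mask value are my assumptions for typeBitsWidth and typeMask, consistent with packing four entries per byte).

package main

import "fmt"

func main() {
	const typeBitsWidth, typeMask = 2, 3 // assumed: two type bits per word
	ptrmask := []byte{0xe4, 0x1b}        // example mask covering eight words
	for w := uint(0); w < 8; w++ {
		bits := (ptrmask[w/4] >> ((w % 4) * typeBitsWidth)) & typeMask
		fmt.Printf("word %d: type bits %02b\n", w, bits)
	}
	// Words 0..3 decode to 00, 01, 10, 11 and words 4..7 to 11, 10, 01, 00.
}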
// scanblock scans b as scanobject would.
// If the gcphase is GCscan, scanblock performs additional checks.
//go:nowritebarrier
func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWorkProducer) {
// Use local copies of original parameters, so that a stack trace
// due to one of the throws below shows the original block
// base and extent.
b := b0
n := n0
// ptrmask can have 2 possible values:
// 1. nil - obtain pointer mask from GC bitmap.
// 2. pointer to a compact mask (for stacks and data).
scanobject(b, n, ptrmask, gcw)
if gcphase == _GCscan {
if inheap(b) && ptrmask == nil {
// b is in heap, we are in GCscan so there should be a ptrmask.
throw("scanblock: In GCscan phase and inheap is true.")
}
}
}
// gcDrain scans objects in work buffers, blackening grey
// objects until all work has been drained.
//go:nowritebarrier
func gcDrain(gcw *gcWork) {
if gcphase != _GCmark && gcphase != _GCmarktermination {
throw("scanblock phase incorrect")
}
var gcdatamask bitvector
var gcbssmask bitvector
for {
// If another proc wants a pointer, give it some.
if work.nwait > 0 && work.full == 0 {
gcw.balance()
}
// heapminimum is the minimum number of bytes in the heap.
// This cleans up the corner case where we have a very small live set but a lot
// of allocations and collecting every GOGC * live set is expensive.
var heapminimum = uint64(4 << 20)
b := gcw.get()
if b == 0 {
// work barrier reached
break
}
// If the current wbuf is filled by the scan a new wbuf might be
// returned that could possibly hold only a single object. This
// could result in each iteration draining only a single object
// out of the wbuf passed in + a single object placed
// into an empty wbuf in scanobject so there could be
// a performance hit as we keep fetching fresh wbufs.
scanobject(b, 0, nil, &gcw.gcWorkProducer)
}
checknocurrentwbuf()
}
// Initialized from $GOGC. GOGC=off means no GC.
var gcpercent int32
// gcDrainN scans n objects, blackening grey objects.
//go:nowritebarrier
func gcDrainN(gcw *gcWork, n int) {
checknocurrentwbuf()
for i := 0; i < n; i++ {
// This might be a good place to add prefetch code...
// if(wbuf.nobj > 4) {
// PREFETCH(wbuf->obj[wbuf.nobj - 3];
// }
b := gcw.tryGet()
if b == 0 {
return
}
scanobject(b, 0, nil, &gcw.gcWorkProducer)
func gcinit() {
if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
throw("size of Workbuf is suboptimal")
}
}
//go:nowritebarrier
func markroot(desc *parfor, i uint32) {
var gcw gcWorkProducer
gcw.initFromCache()
// Note: if you add a case here, please also update heapdump.c:dumproots.
switch i {
case _RootData:
scanblock(uintptr(unsafe.Pointer(&data)), uintptr(unsafe.Pointer(&edata))-uintptr(unsafe.Pointer(&data)), gcdatamask.bytedata, &gcw)
case _RootBss:
scanblock(uintptr(unsafe.Pointer(&bss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss)), gcbssmask.bytedata, &gcw)
case _RootFinalizers:
for fb := allfin; fb != nil; fb = fb.alllink {
scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], &gcw)
}
case _RootSpans:
// mark MSpan.specials
sg := mheap_.sweepgen
for spanidx := uint32(0); spanidx < uint32(len(work.spans)); spanidx++ {
s := work.spans[spanidx]
if s.state != mSpanInUse {
continue
}
if !checkmarkphase && s.sweepgen != sg {
// sweepgen was updated (+2) during non-checkmark GC pass
print("sweep ", s.sweepgen, " ", sg, "\n")
throw("gc: unswept span")
}
for sp := s.specials; sp != nil; sp = sp.next {
if sp.kind != _KindSpecialFinalizer {
continue
}
// don't mark finalized object, but scan it so we
// retain everything it points to.
spf := (*specialfinalizer)(unsafe.Pointer(sp))
// A finalizer can be set for an inner byte of an object, find object beginning.
p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize
if gcphase != _GCscan {
scanblock(p, s.elemsize, nil, &gcw) // scanned during mark phase
}
scanblock(uintptr(unsafe.Pointer(&spf.fn)), ptrSize, &oneptr[0], &gcw)
}
}
case _RootFlushCaches:
if gcphase != _GCscan { // Do not flush mcaches during GCscan phase.
flushallmcaches()
}
default:
// the rest is scanning goroutine stacks
if uintptr(i-_RootCount) >= allglen {
throw("markroot: bad index")
}
gp := allgs[i-_RootCount]
// remember when we've first observed the G blocked
// needed only to output in traceback
status := readgstatus(gp) // We are not in a scan state
if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 {
gp.waitsince = work.tstart
}
// Shrink a stack if not much of it is being used but not in the scan phase.
if gcphase == _GCmarktermination {
// Shrink during STW GCmarktermination phase thus avoiding
// complications introduced by shrinking during
// non-STW phases.
shrinkstack(gp)
}
if readgstatus(gp) == _Gdead {
gp.gcworkdone = true
} else {
gp.gcworkdone = false
}
restart := stopg(gp)
// goroutine will scan its own stack when it stops running.
// Wait until it has.
for readgstatus(gp) == _Grunning && !gp.gcworkdone {
}
// scanstack(gp) is done as part of gcphasework.
// But to be sure it finished, we need to verify that the
// stack traps have all responded, so drop into this loop
// until they respond.
for !gp.gcworkdone {
status = readgstatus(gp)
if status == _Gdead {
gp.gcworkdone = true // scan is a noop
break
}
if status == _Gwaiting || status == _Grunnable {
restart = stopg(gp)
}
}
if restart {
restartg(gp)
}
}
gcw.dispose()
work.markfor = parforalloc(_MaxGcproc)
gcpercent = readgogc()
gcdatamask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcdata)), uintptr(unsafe.Pointer(&edata))-uintptr(unsafe.Pointer(&data)))
gcbssmask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcbss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss)))
memstats.next_gc = heapminimum
}
//go:nowritebarrier
func stackmapdata(stkmap *stackmap, n int32) bitvector {
if n < 0 || n >= stkmap.n {
throw("stackmapdata: index out of range")
func setGCPercent(in int32) (out int32) {
lock(&mheap_.lock)
out = gcpercent
if in < 0 {
in = -1
}
return bitvector{stkmap.nbit, (*byte)(add(unsafe.Pointer(&stkmap.bytedata), uintptr(n*((stkmap.nbit+31)/32*4))))}
gcpercent = in
unlock(&mheap_.lock)
return out
}
// Scan a stack frame: local variables and function arguments/results.
//go:nowritebarrier
func scanframeworker(frame *stkframe, unused unsafe.Pointer, gcw *gcWorkProducer) {
f := frame.fn
targetpc := frame.continpc
if targetpc == 0 {
// Frame is dead.
return
}
if _DebugGC > 1 {
print("scanframe ", funcname(f), "\n")
}
if targetpc != f.entry {
targetpc--
}
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
if pcdata == -1 {
// We do not have a valid pcdata value but there might be a
// stackmap for this function. It is likely that we are looking
// at the function prologue, assume so and hope for the best.
pcdata = 0
}
// Scan local variables if stack frame has been allocated.
size := frame.varp - frame.sp
var minsize uintptr
if thechar != '6' && thechar != '8' {
minsize = ptrSize
} else {
minsize = 0
}
if size > minsize {
stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
if stkmap == nil || stkmap.n <= 0 {
print("runtime: frame ", funcname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n")
throw("missing stackmap")
}
// Locals bitmap information, scan just the pointers in locals.
if pcdata < 0 || pcdata >= stkmap.n {
// don't know where we are
print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " locals stack map entries for ", funcname(f), " (targetpc=", targetpc, ")\n")
throw("scanframe: bad symbol table")
}
bv := stackmapdata(stkmap, pcdata)
size = (uintptr(bv.n) / typeBitsWidth) * ptrSize
scanblock(frame.varp-size, size, bv.bytedata, gcw)
}
// Trigger the concurrent GC when 1/triggerratio memory is available to allocate.
// Adjust this ratio as part of a scheme to ensure that mutators have enough
// memory to allocate during a concurrent GC cycle.
var triggerratio = int64(8)
// Scan arguments.
if frame.arglen > 0 {
var bv bitvector
if frame.argmap != nil {
bv = *frame.argmap
} else {
stkmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
if stkmap == nil || stkmap.n <= 0 {
print("runtime: frame ", funcname(f), " untyped args ", hex(frame.argp), "+", hex(frame.arglen), "\n")
throw("missing stackmap")
}
if pcdata < 0 || pcdata >= stkmap.n {
// don't know where we are
print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " args stack map entries for ", funcname(f), " (targetpc=", targetpc, ")\n")
throw("scanframe: bad symbol table")
}
bv = stackmapdata(stkmap, pcdata)
}
scanblock(frame.argp, uintptr(bv.n)/typeBitsWidth*ptrSize, bv.bytedata, gcw)
}
// Determine whether to initiate a GC.
// If the GC is already working, there is no need to trigger another one.
// This should establish a feedback loop where, if the GC does not
// have sufficient time to complete, more memory will be
// requested from the OS, increasing the heap size and thus allowing future
// GCs more time to complete.
// memstats.heap_alloc and memstats.next_gc reads have benign races:
// a false negative simply does not start a GC, a false positive
// will start a GC needlessly. Neither has correctness issues.
func shouldtriggergc() bool {
return triggerratio*(int64(memstats.next_gc)-int64(memstats.heap_alloc)) <= int64(memstats.next_gc) && atomicloaduint(&bggc.working) == 0
}
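A standalone worked example (not runtime code) of the condition above: with triggerratio = 8, the concurrent GC is triggered once less than 1/8 of next_gc remains to allocate, that is, once heap_alloc reaches 7/8 of next_gc.

package main

import "fmt"

func main() {
	const triggerratio = 8
	nextGC := int64(100 << 20) // hypothetical next_gc of 100MB
	for _, heapAlloc := range []int64{80 << 20, 87 << 20, 88 << 20, 95 << 20} {
		trigger := triggerratio*(nextGC-heapAlloc) <= nextGC
		fmt.Printf("heap_alloc=%dMB trigger=%v\n", heapAlloc>>20, trigger)
	}
	// The threshold is 87.5MB, so this prints false, false, true, true.
}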
//go:nowritebarrier
func scanstack(gp *g) {
if gp.gcscanvalid {
return
}
if readgstatus(gp)&_Gscan == 0 {
print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n")
throw("scanstack - bad status")
}
switch readgstatus(gp) &^ _Gscan {
default:
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
throw("mark - bad status")
case _Gdead:
return
case _Grunning:
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
throw("scanstack: goroutine not stopped")
case _Grunnable, _Gsyscall, _Gwaiting:
// ok
}
if gp == getg() {
throw("can't scan our own stack")
}
mp := gp.m
if mp != nil && mp.helpgc != 0 {
throw("can't scan gchelper stack")
}
var gcw gcWorkProducer
gcw.initFromCache()
scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
// Pick up gcw as free variable so gentraceback and friends can
// keep the same signature.
scanframeworker(frame, unused, &gcw)
return true
}
gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, scanframe, nil, 0)
tracebackdefers(gp, scanframe, nil)
gcw.disposeToCache()
gp.gcscanvalid = true
}
var work workdata
// Shade the object if it isn't already.
// The object is not nil and known to be in the heap.
//go:nowritebarrier
func shade(b uintptr) {
if !inheap(b) {
throw("shade: passed an address not in the heap")
}
if obj, hbits := heapBitsForObject(b); obj != 0 {
// TODO: this would be a great place to put a check to see
// if we are harvesting and if we are then we should
// figure out why there is a call to shade when the
// harvester thinks we are in a STW.
// if atomicload(&harvestingwbufs) == uint32(1) {
// // Throw here to discover write barriers
// // being executed during a STW.
// throw("shade during harvest")
// }
var gcw gcWorkProducer
greyobject(obj, 0, 0, hbits, &gcw)
// This is part of the write barrier so put the wbuf back.
if gcphase == _GCmarktermination {
gcw.dispose()
} else {
// If we added any pointers to the gcw, then
// currentwbuf must be nil because 1)
// greyobject got its wbuf from currentwbuf
// and 2) shade runs on the systemstack, so
// we're still on the same M. If either of
// these becomes no longer true, we need to
// rethink this.
gcw.disposeToCache()
}
}
}
type workdata struct {
full uint64 // lock-free list of full blocks workbuf
empty uint64 // lock-free list of empty blocks workbuf
partial uint64 // lock-free list of partially filled blocks workbuf
pad0 [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
nproc uint32
tstart int64
nwait uint32
ndone uint32
alldone note
markfor *parfor
// gchelpwork does a small bounded amount of gc work. The purpose is to
// shorten the time (as measured by allocations) spent doing a concurrent GC.
// The number of mutator calls is roughly proportional to the number of allocations
// made by that mutator. This slows down the allocation while speeding up the GC.
//go:nowritebarrier
func gchelpwork() {
switch gcphase {
default:
throw("gcphasework in bad gcphase")
case _GCoff, _GCquiesce, _GCstw:
// No work.
case _GCsweep:
// We could help by calling sweepone to sweep a single span.
// _ = sweepone()
case _GCscan:
// scan the stack, mark the objects, put pointers in work buffers
// hanging off the P where this is being run.
// scanstack(gp)
case _GCmark:
// Get a full work buffer and empty it.
// drain your own currentwbuf first in the hopes that it will
// be more cache friendly.
var gcw gcWork
gcw.initFromCache()
const n = len(workbuf{}.obj)
gcDrainN(&gcw, n) // drain up to one buffer's worth of objects
gcw.dispose()
case _GCmarktermination:
// We should never be here since the world is stopped.
// All available mark work will be emptied before returning.
throw("gcphasework in bad gcphase")
}
// Copy of mheap.allspans for marker or sweeper.
spans []*mspan
}
// The gp has been moved to a GC safepoint. GC phase specific
// work is done here.
//go:nowritebarrier
func gcphasework(gp *g) {
switch gcphase {
default:
throw("gcphasework in bad gcphase")
case _GCoff, _GCquiesce, _GCstw, _GCsweep:
// No work.
case _GCscan:
// scan the stack, mark the objects, put pointers in work buffers
// hanging off the P where this is being run.
// Indicate that the scan is valid until the goroutine runs again
scanstack(gp)
case _GCmark:
// No work.
case _GCmarktermination:
scanstack(gp)
// All available mark work will be emptied before returning.
}
gp.gcworkdone = true
// GC runs a garbage collection.
func GC() {
gogc(2)
}
// Returns only when span s has been swept.
//go:nowritebarrier
func mSpan_EnsureSwept(s *mspan) {
// Caller must disable preemption.
// Otherwise when this function returns the span can become unswept again
// (if GC is triggered on another goroutine).
_g_ := getg()
if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
throw("MSpan_EnsureSwept: m is not locked")
}
// force = 0 - start concurrent GC
// force = 1 - do STW GC regardless of current heap usage
// force = 2 - do STW GC and eager sweep
func gogc(force int32) {
// The gc is turned off (via enablegc) until the bootstrap has completed.
// Also, malloc gets called in the guts of a number of libraries that might be
// holding locks. To avoid deadlocks during stoptheworld, don't bother
// trying to run gc while holding a lock. The next mallocgc without a lock
// will do the gc instead.
sg := mheap_.sweepgen
if atomicload(&s.sweepgen) == sg {
return
}
// The caller must be sure that the span is a MSpanInUse span.
if cas(&s.sweepgen, sg-2, sg-1) {
mSpan_Sweep(s, false)
mp := acquirem()
if gp := getg(); gp == mp.g0 || mp.locks > 1 || !memstats.enablegc || panicking != 0 || gcpercent < 0 {
releasem(mp)
return
}
// unfortunate condition, and we don't have efficient means to wait
for atomicload(&s.sweepgen) != sg {
osyield()
}
}
// Sweep frees or collects finalizers for blocks not marked in the mark phase.
// It clears the mark bits in preparation for the next GC round.
// Returns true if the span was returned to heap.
// If preserve=true, don't return it to heap nor relink in MCentral lists;
// caller takes care of it.
//TODO go:nowritebarrier
func mSpan_Sweep(s *mspan, preserve bool) bool {
if checkmarkphase {
throw("MSpan_Sweep: checkmark only runs in STW and after the sweep")
}
// It's critical that we enter this function with preemption disabled,
// GC must not start while we are in the middle of this function.
_g_ := getg()
if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
throw("MSpan_Sweep: m is not locked")
}
sweepgen := mheap_.sweepgen
if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
throw("MSpan_Sweep: bad span state")
}
if trace.enabled {
traceGCSweepStart()
}
cl := s.sizeclass
size := s.elemsize
res := false
nfree := 0
var head, end gclinkptr
releasem(mp)
mp = nil
c := _g_.m.mcache
sweepgenset := false
// Mark any free objects in this span so we don't collect them.
for link := s.freelist; link.ptr() != nil; link = link.ptr().next {
heapBitsForAddr(uintptr(link)).setMarkedNonAtomic()
}
// Unlink & free special records for any objects we're about to free.
specialp := &s.specials
special := *specialp
for special != nil {
// A finalizer can be set for an inner byte of an object, find object beginning.
p := uintptr(s.start<<_PageShift) + uintptr(special.offset)/size*size
hbits := heapBitsForAddr(p)
if !hbits.isMarked() {
// Find the exact byte for which the special was setup
// (as opposed to object beginning).
p := uintptr(s.start<<_PageShift) + uintptr(special.offset)
// about to free object: splice out special record
y := special
special = special.next
*specialp = special
if !freespecial(y, unsafe.Pointer(p), size, false) {
// stop freeing of object if it has a finalizer
hbits.setMarkedNonAtomic()
}
} else {
// object is still live: keep special record
specialp = &special.next
special = *specialp
if force == 0 {
lock(&bggc.lock)
if !bggc.started {
bggc.working = 1
bggc.started = true
go backgroundgc()
} else if bggc.working == 0 {
bggc.working = 1
ready(bggc.g)
}
unlock(&bggc.lock)
} else {
gcwork(force)
}
// Sweep through n objects of given size starting at p.
// This thread owns the span now, so it can manipulate
// the block bitmap without atomic operations.
size, n, _ := s.layout()
heapBitsSweepSpan(s.base(), size, n, func(p uintptr) {
// At this point we know that we are looking at a garbage object
// that needs to be collected.
if debug.allocfreetrace != 0 {
tracefree(unsafe.Pointer(p), size)
}
// Reset to allocated+noscan.
if cl == 0 {
// Free large span.
if preserve {
throw("can't preserve large span")
}
heapBitsForSpan(p).clearSpan(s.layout())
s.needzero = 1
// important to set sweepgen before returning it to heap
atomicstore(&s.sweepgen, sweepgen)
sweepgenset = true
// NOTE(rsc,dvyukov): The original implementation of efence
// in CL 22060046 used SysFree instead of SysFault, so that
// the operating system would eventually give the memory
// back to us again, so that an efence program could run
// longer without running out of memory. Unfortunately,
// calling SysFree here without any kind of adjustment of the
// heap data structures means that when the memory does
// come back to us, we have the wrong metadata for it, either in
// the MSpan structures or in the garbage collection bitmap.
// Using SysFault here means that the program will run out of
// memory fairly quickly in efence mode, but at least it won't
// have mysterious crashes due to confused memory reuse.
// It should be possible to switch back to SysFree if we also
// implement and then call some kind of MHeap_DeleteSpan.
if debug.efence > 0 {
s.limit = 0 // prevent mlookup from finding this span
sysFault(unsafe.Pointer(p), size)
} else {
mHeap_Free(&mheap_, s, 1)
}
c.local_nlargefree++
c.local_largefree += size
reduction := int64(size) * int64(gcpercent+100) / 100
if int64(memstats.next_gc)-reduction > int64(heapminimum) {
xadd64(&memstats.next_gc, -reduction)
} else {
atomicstore64(&memstats.next_gc, heapminimum)
}
res = true
} else {
// Free small object.
if size > 2*ptrSize {
*(*uintptr)(unsafe.Pointer(p + ptrSize)) = uintptrMask & 0xdeaddeaddeaddead // mark as "needs to be zeroed"
} else if size > ptrSize {
*(*uintptr)(unsafe.Pointer(p + ptrSize)) = 0
}
if head.ptr() == nil {
head = gclinkptr(p)
} else {
end.ptr().next = gclinkptr(p)
}
end = gclinkptr(p)
end.ptr().next = gclinkptr(0x0bade5)
nfree++
}
})
// We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
// because of the potential for a concurrent free/SetFinalizer.
// But we need to set it before we make the span available for allocation
// (return it to heap or mcentral), because allocation code assumes that a
// span is already swept if available for allocation.
if !sweepgenset && nfree == 0 {
// The span must be in our exclusive ownership until we update sweepgen,
// check for potential races.
if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
throw("MSpan_Sweep: bad span state after sweep")
}
atomicstore(&s.sweepgen, sweepgen)
}
if nfree > 0 {
c.local_nsmallfree[cl] += uintptr(nfree)
c.local_cachealloc -= intptr(uintptr(nfree) * size)
reduction := int64(nfree) * int64(size) * int64(gcpercent+100) / 100
if int64(memstats.next_gc)-reduction > int64(heapminimum) {
xadd64(&memstats.next_gc, -reduction)
} else {
atomicstore64(&memstats.next_gc, heapminimum)
}
res = mCentral_FreeSpan(&mheap_.central[cl].mcentral, s, int32(nfree), head, end, preserve)
// MCentral_FreeSpan updates sweepgen
}
if trace.enabled {
traceGCSweepDone()
traceNextGC()
}
return res
}
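As a standalone worked example of the next_gc adjustment performed in the sweep above (my own sketch; the numbers are hypothetical): each freed byte lowers the next GC target by (gcpercent+100)/100 bytes, floored at heapminimum.

package main

import "fmt"

func main() {
	const heapminimum = 4 << 20 // 4MB floor, as set up in gcinit
	gcpercent := int64(100)     // default GOGC
	nextGC := int64(6 << 20)    // hypothetical current target: 6MB
	freed := int64(1 << 20)     // sweeping frees a 1MB large object

	reduction := freed * (gcpercent + 100) / 100 // 2MB with GOGC=100
	if nextGC-reduction > heapminimum {
		nextGC -= reduction
	} else {
		nextGC = heapminimum
	}
	fmt.Printf("next_gc after sweep: %dMB\n", nextGC>>20) // clamped to the 4MB floor
}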
// State of background sweep.
// Protected by gclock.
type sweepdata struct {
g *g
parked bool
started bool
spanidx uint32 // background sweeper position
nbgsweep uint32
npausesweep uint32
}
var sweep sweepdata
// State of the background concurrent GC goroutine.
var bggc struct {
lock mutex
......@@ -1027,237 +271,161 @@ var bggc struct {
started bool
}
// sweeps one span
// returns number of pages returned to heap, or ^uintptr(0) if there is nothing to sweep
//go:nowritebarrier
func sweepone() uintptr {
_g_ := getg()
// increment locks to ensure that the goroutine is not preempted
// in the middle of sweep thus leaving the span in an inconsistent state for next GC
_g_.m.locks++
sg := mheap_.sweepgen
// backgroundgc is running in a goroutine and does the concurrent GC work.
// bggc holds the state of the backgroundgc.
func backgroundgc() {
bggc.g = getg()
for {
idx := xadd(&sweep.spanidx, 1) - 1
if idx >= uint32(len(work.spans)) {
mheap_.sweepdone = 1
_g_.m.locks--
return ^uintptr(0)
}
s := work.spans[idx]
if s.state != mSpanInUse {
s.sweepgen = sg
continue
}
if s.sweepgen != sg-2 || !cas(&s.sweepgen, sg-2, sg-1) {
continue
}
npages := s.npages
if !mSpan_Sweep(s, false) {
npages = 0
}
_g_.m.locks--
return npages
gcwork(0)
lock(&bggc.lock)
bggc.working = 0
goparkunlock(&bggc.lock, "Concurrent GC wait", traceEvGoBlock)
}
}
//go:nowritebarrier
func gosweepone() uintptr {
var ret uintptr
systemstack(func() {
ret = sweepone()
})
return ret
}
func gcwork(force int32) {
//go:nowritebarrier
func gosweepdone() bool {
return mheap_.sweepdone != 0
}
//go:nowritebarrier
func gchelper() {
_g_ := getg()
_g_.m.traceback = 2
gchelperstart()
if trace.enabled {
traceGCScanStart()
}
semacquire(&worldsema, false)
// parallel mark for over GC roots
parfordo(work.markfor)
if gcphase != _GCscan {
var gcw gcWork
gcDrain(&gcw) // blocks in getfull
gcw.dispose()
// Pick up the remaining unswept/not being swept spans concurrently
for gosweepone() != ^uintptr(0) {
sweep.nbgsweep++
}
if trace.enabled {
traceGCScanDone()
}
// Ok, we're doing it! Stop everybody else
nproc := work.nproc // work.nproc can change right after we increment work.ndone
if xadd(&work.ndone, +1) == nproc-1 {
notewakeup(&work.alldone)
mp := acquirem()
mp.preemptoff = "gcing"
releasem(mp)
gctimer.count++
if force == 0 {
gctimer.cycle.sweepterm = nanotime()
}
_g_.m.traceback = 0
}
//go:nowritebarrier
func cachestats() {
for i := 0; ; i++ {
p := allp[i]
if p == nil {
break
}
c := p.mcache
if c == nil {
continue
if trace.enabled {
traceGoSched()
traceGCStart()
}
// Pick up the remaining unswept/not being swept spans before we STW
for gosweepone() != ^uintptr(0) {
sweep.nbgsweep++
}
systemstack(stoptheworld)
systemstack(finishsweep_m) // finish sweep before we start concurrent scan.
if force == 0 { // Do as much work concurrently as possible
gcphase = _GCscan
systemstack(starttheworld)
gctimer.cycle.scan = nanotime()
// Do a concurrent heap scan before we stop the world.
systemstack(gcscan_m)
gctimer.cycle.installmarkwb = nanotime()
systemstack(stoptheworld)
systemstack(gcinstallmarkwb)
systemstack(harvestwbufs)
systemstack(starttheworld)
gctimer.cycle.mark = nanotime()
systemstack(gcmark_m)
gctimer.cycle.markterm = nanotime()
systemstack(stoptheworld)
systemstack(gcinstalloffwb_m)
} else {
// For non-concurrent GC (force != 0) g stacks have not been scanned, so
// set gcscanvalid such that mark termination scans all stacks.
// No races here since we are in a STW phase.
for _, gp := range allgs {
gp.gcworkdone = false // set to true in gcphasework
gp.gcscanvalid = false // stack has not been scanned
}
purgecachedstats(c)
}
}
//go:nowritebarrier
func flushallmcaches() {
for i := 0; ; i++ {
p := allp[i]
if p == nil {
break
}
c := p.mcache
if c == nil {
continue
}
mCache_ReleaseAll(c)
stackcache_clear(c)
startTime := nanotime()
if mp != acquirem() {
throw("gogc: rescheduled")
}
}
//go:nowritebarrier
func updatememstats(stats *gcstats) {
if stats != nil {
*stats = gcstats{}
clearpools()
// Run gc on the g0 stack. We do this so that the g stack
// we're currently running on will no longer change. Cuts
// the root set down a bit (g0 stacks are not scanned, and
// we don't need to scan gc's internal state). We also
// need to switch to g0 so we can shrink the stack.
n := 1
if debug.gctrace > 1 {
n = 2
}
for mp := allm; mp != nil; mp = mp.alllink {
if stats != nil {
src := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(&mp.gcstats))
dst := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(stats))
for i, v := range src {
dst[i] += v
}
mp.gcstats = gcstats{}
eagersweep := force >= 2
for i := 0; i < n; i++ {
if i > 0 {
// refresh start time if doing a second GC
startTime = nanotime()
}
// switch to g0, call gc, then switch back
systemstack(func() {
gc_m(startTime, eagersweep)
})
}
memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse)
memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse)
memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys +
memstats.mcache_sys + memstats.buckhash_sys + memstats.gc_sys + memstats.other_sys
// Calculate memory allocator stats.
// During program execution we only count number of frees and amount of freed memory.
// Current number of alive objects in the heap and amount of alive heap memory
// are calculated by scanning all spans.
// Total number of mallocs is calculated as number of frees plus number of alive objects.
// Similarly, total amount of allocated memory is calculated as amount of freed memory
// plus amount of alive heap memory.
memstats.alloc = 0
memstats.total_alloc = 0
memstats.nmalloc = 0
memstats.nfree = 0
for i := 0; i < len(memstats.by_size); i++ {
memstats.by_size[i].nmalloc = 0
memstats.by_size[i].nfree = 0
}
systemstack(func() {
gccheckmark_m(startTime, eagersweep)
})
// Flush MCache's to MCentral.
systemstack(flushallmcaches)
if trace.enabled {
traceGCDone()
traceGoStart()
}
// Aggregate local stats.
cachestats()
// all done
mp.preemptoff = ""
// Scan all spans and count number of alive objects.
lock(&mheap_.lock)
for i := uint32(0); i < mheap_.nspan; i++ {
s := h_allspans[i]
if s.state != mSpanInUse {
continue
}
if s.sizeclass == 0 {
memstats.nmalloc++
memstats.alloc += uint64(s.elemsize)
} else {
memstats.nmalloc += uint64(s.ref)
memstats.by_size[s.sizeclass].nmalloc += uint64(s.ref)
memstats.alloc += uint64(s.ref) * uint64(s.elemsize)
}
if force == 0 {
gctimer.cycle.sweep = nanotime()
}
unlock(&mheap_.lock)
// Aggregate by size class.
smallfree := uint64(0)
memstats.nfree = mheap_.nlargefree
for i := 0; i < len(memstats.by_size); i++ {
memstats.nfree += mheap_.nsmallfree[i]
memstats.by_size[i].nfree = mheap_.nsmallfree[i]
memstats.by_size[i].nmalloc += mheap_.nsmallfree[i]
smallfree += uint64(mheap_.nsmallfree[i]) * uint64(class_to_size[i])
semrelease(&worldsema)
if force == 0 {
if gctimer.verbose > 1 {
GCprinttimes()
} else if gctimer.verbose > 0 {
calctimes() // ignore result
}
}
memstats.nfree += memstats.tinyallocs
memstats.nmalloc += memstats.nfree
// Calculate derived stats.
memstats.total_alloc = uint64(memstats.alloc) + uint64(mheap_.largefree) + smallfree
memstats.heap_alloc = memstats.alloc
memstats.heap_objects = memstats.nmalloc - memstats.nfree
}
systemstack(starttheworld)
// heapminimum is the minimum number of bytes in the heap.
// This cleans up the corner case where we have a very small live set but a lot
// of allocations and collecting every GOGC * live set is expensive.
var heapminimum = uint64(4 << 20)
releasem(mp)
mp = nil
func gcinit() {
if unsafe.Sizeof(workbuf{}) != _WorkbufSize {
throw("size of Workbuf is suboptimal")
// now that gc is done, kick off finalizer thread if needed
if !concurrentSweep {
// give the queued finalizers, if any, a chance to run
Gosched()
}
work.markfor = parforalloc(_MaxGcproc)
gcpercent = readgogc()
gcdatamask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcdata)), uintptr(unsafe.Pointer(&edata))-uintptr(unsafe.Pointer(&data)))
gcbssmask = unrollglobgcprog((*byte)(unsafe.Pointer(&gcbss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss)))
memstats.next_gc = heapminimum
}
// Called from malloc.go using systemstack, stopping and starting the world handled in caller.
// For now this must be bracketed with a stoptheworld and a starttheworld to ensure
// all goroutines see the new barrier.
//go:nowritebarrier
func gc_m(start_time int64, eagersweep bool) {
_g_ := getg()
gp := _g_.m.curg
casgstatus(gp, _Grunning, _Gwaiting)
gp.waitreason = "garbage collection"
gc(start_time, eagersweep)
casgstatus(gp, _Gwaiting, _Grunning)
func gcinstalloffwb_m() {
gcphase = _GCoff
}
// For now this must be bracketed with a stoptheworld and a starttheworld to ensure
// all goroutines see the new barrier.
//go:nowritebarrier
func initCheckmarks() {
for _, s := range work.spans {
if s.state == _MSpanInUse {
heapBitsForSpan(s.base()).initCheckmarkSpan(s.layout())
}
}
func gcinstallmarkwb() {
gcphase = _GCmark
}
func clearCheckmarks() {
for _, s := range work.spans {
if s.state == _MSpanInUse {
heapBitsForSpan(s.base()).clearCheckmarkSpan(s.layout())
}
}
// Mark all objects that are known about.
// This is the concurrent mark phase.
//go:nowritebarrier
func gcmark_m() {
gcDrain(nil)
// TODO add another harvestwbuf and reset work.nwait=0, work.ndone=0, and work.nproc=1
// and repeat the above gcDrain.
}
// Called from malloc.go using systemstack.
......@@ -1280,90 +448,16 @@ func gccheckmark_m(startTime int64, eagersweep bool) {
gc_m(startTime, eagersweep) // turns off checkmarkphase + calls clearcheckmarkbits
}
// Called from malloc.go using systemstack, stopping and starting the world handled in caller.
//go:nowritebarrier
func finishsweep_m() {
// The world is stopped so we should be able to complete the sweeps
// quickly.
for sweepone() != ^uintptr(0) {
sweep.npausesweep++
}
// There may be some other spans being swept concurrently that
// we need to wait for. If finishsweep_m is done with the world stopped
// this code is not required.
sg := mheap_.sweepgen
for _, s := range work.spans {
if s.sweepgen != sg && s.state == _MSpanInUse {
mSpan_EnsureSwept(s)
}
}
}
// Scan all of the stacks, greying (or graying if in America) the referents
// but not blackening them since the mark write barrier isn't installed.
//go:nowritebarrier
func gcscan_m() {
func gc_m(start_time int64, eagersweep bool) {
_g_ := getg()
gp := _g_.m.curg
casgstatus(gp, _Grunning, _Gwaiting)
gp.waitreason = "garbage collection"
// Grab the g that called us and potentially allow rescheduling.
// This allows it to be scanned like other goroutines.
mastergp := _g_.m.curg
casgstatus(mastergp, _Grunning, _Gwaiting)
mastergp.waitreason = "garbage collection scan"
// Span sweeping has been done by finishsweep_m.
// Long term we will want to make this goroutine runnable
// by placing it onto a scanenqueue state and then calling
// runtime·restartg(mastergp) to make it Grunnable.
// At the bottom we will want to return this p back to the scheduler.
// Prepare flag indicating that the scan has not been completed.
lock(&allglock)
local_allglen := allglen
for i := uintptr(0); i < local_allglen; i++ {
gp := allgs[i]
gp.gcworkdone = false // set to true in gcphasework
gp.gcscanvalid = false // stack has not been scanned
}
unlock(&allglock)
work.nwait = 0
work.ndone = 0
work.nproc = 1 // For now do not do this in parallel.
// ackgcphase is not needed since we are not scanning running goroutines.
parforsetup(work.markfor, work.nproc, uint32(_RootCount+local_allglen), false, markroot)
parfordo(work.markfor)
lock(&allglock)
// Check that gc work is done.
for i := uintptr(0); i < local_allglen; i++ {
gp := allgs[i]
if !gp.gcworkdone {
throw("scan missed a g")
}
}
unlock(&allglock)
casgstatus(mastergp, _Gwaiting, _Grunning)
// Let the g that called us continue to run.
}
// Mark all objects that are known about.
// This is the concurrent mark phase.
//go:nowritebarrier
func gcmark_m() {
var gcw gcWork
gcDrain(&gcw)
gcw.dispose()
// TODO add another harvestwbuf and reset work.nwait=0, work.ndone=0, and work.nproc=1
// and repeat the above gcDrain.
}
// For now this must be bracketed with a stoptheworld and a starttheworld to ensure
// all goroutines see the new barrier.
//go:nowritebarrier
func gcinstalloffwb_m() {
gcphase = _GCoff
gc(start_time, eagersweep)
casgstatus(gp, _Gwaiting, _Grunning)
}
// STW is in effect at this point.
......@@ -1573,68 +667,89 @@ func gc(start_time int64, eagersweep bool) {
}
}
func readmemstats_m(stats *MemStats) {
updatememstats(nil)
// Size of the trailing by_size array differs between Go and C,
// NumSizeClasses was changed, but we cannot change the Go struct because of backward compatibility.
memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats)
// Hooks for other packages
// Stack numbers are part of the heap numbers, separate those out for user consumption
stats.StackSys = stats.StackInuse
stats.HeapInuse -= stats.StackInuse
stats.HeapSys -= stats.StackInuse
//go:linkname runtime_debug_freeOSMemory runtime/debug.freeOSMemory
func runtime_debug_freeOSMemory() {
gogc(2) // force GC and do eager sweep
systemstack(scavenge_m)
}
//go:linkname readGCStats runtime/debug.readGCStats
func readGCStats(pauses *[]uint64) {
systemstack(func() {
readGCStats_m(pauses)
})
var poolcleanup func()
//go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
func sync_runtime_registerPoolCleanup(f func()) {
poolcleanup = f
}
func readGCStats_m(pauses *[]uint64) {
p := *pauses
// Calling code in runtime/debug should make the slice large enough.
if cap(p) < len(memstats.pause_ns)+3 {
throw("short slice passed to readGCStats")
func clearpools() {
// clear sync.Pools
if poolcleanup != nil {
poolcleanup()
}
// Pass back: pauses, pause ends, last gc (absolute time), number of gc, total pause ns.
lock(&mheap_.lock)
for _, p := range &allp {
if p == nil {
break
}
// clear tinyalloc pool
if c := p.mcache; c != nil {
c.tiny = nil
c.tinyoffset = 0
// disconnect cached list before dropping it on the floor,
// so that a dangling ref to one entry does not pin all of them.
var sg, sgnext *sudog
for sg = c.sudogcache; sg != nil; sg = sgnext {
sgnext = sg.next
sg.next = nil
}
c.sudogcache = nil
}
n := memstats.numgc
if n > uint32(len(memstats.pause_ns)) {
n = uint32(len(memstats.pause_ns))
// clear defer pools
for i := range p.deferpool {
// disconnect cached list before dropping it on the floor,
// so that a dangling ref to one entry does not pin all of them.
var d, dlink *_defer
for d = p.deferpool[i]; d != nil; d = dlink {
dlink = d.link
d.link = nil
}
p.deferpool[i] = nil
}
}
}
// Timing
// The pause buffer is circular. The most recent pause is at
// pause_ns[(numgc-1)%len(pause_ns)], and then backward
// from there to go back farther in time. We deliver the times
// most recent first (in p[0]).
p = p[:cap(p)]
for i := uint32(0); i < n; i++ {
j := (memstats.numgc - 1 - i) % uint32(len(memstats.pause_ns))
p[i] = memstats.pause_ns[j]
p[n+i] = memstats.pause_end[j]
//go:nowritebarrier
func gchelper() {
_g_ := getg()
_g_.m.traceback = 2
gchelperstart()
if trace.enabled {
traceGCScanStart()
}
p[n+n] = memstats.last_gc
p[n+n+1] = uint64(memstats.numgc)
p[n+n+2] = memstats.pause_total_ns
unlock(&mheap_.lock)
*pauses = p[:n+n+3]
}
// parallel mark for over GC roots
parfordo(work.markfor)
if gcphase != _GCscan {
var gcw gcWork
gcDrain(&gcw) // blocks in getfull
gcw.dispose()
}
func setGCPercent(in int32) (out int32) {
lock(&mheap_.lock)
out = gcpercent
if in < 0 {
in = -1
if trace.enabled {
traceGCScanDone()
}
gcpercent = in
unlock(&mheap_.lock)
return out
nproc := work.nproc // work.nproc can change right after we increment work.ndone
if xadd(&work.ndone, +1) == nproc-1 {
notewakeup(&work.alldone)
}
_g_.m.traceback = 0
}
func gchelperstart() {
......@@ -1648,7 +763,106 @@ func gchelperstart() {
}
}
func unixnanotime() int64 {
sec, nsec := time_now()
return sec*1e9 + int64(nsec)
// gcchronograph holds timer information related to GC phases
// max records the maximum time spent in each GC phase since GCstarttimes.
// total records the total time spent in each GC phase since GCstarttimes.
// cycle records the absolute time (as returned by nanoseconds()) that each GC phase last started at.
type gcchronograph struct {
count int64
verbose int64
maxpause int64
max gctimes
total gctimes
cycle gctimes
}
// gctimes records the time in nanoseconds of each phase of the concurrent GC.
type gctimes struct {
sweepterm int64 // stw
scan int64
installmarkwb int64 // stw
mark int64
markterm int64 // stw
sweep int64
}
var gctimer gcchronograph
// GCstarttimes initializes the gc times. All previous times are lost.
func GCstarttimes(verbose int64) {
gctimer = gcchronograph{verbose: verbose}
}
// GCendtimes stops the gc timers.
func GCendtimes() {
gctimer.verbose = 0
}
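A hypothetical usage sketch of the timer hooks declared here (doWork and its allocation behavior are assumptions, not part of this commit): enable the timers, let concurrent GC cycles run, then print the per-phase latencies.

// Sketch only, from application code importing "runtime".
runtime.GCstarttimes(1) // reset timers; verbose=1 makes gcwork aggregate each cycle
doWork()                // hypothetical allocation-heavy workload driving concurrent GC
runtime.GCprinttimes()  // print the most recent cycle plus max/total per phase
runtime.GCendtimes()    // stop the timers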
// calctimes converts gctimer.cycle into the elapsed times, updates gctimer.total
// and updates gctimer.max with the max pause time.
func calctimes() gctimes {
var times gctimes
var max = func(a, b int64) int64 {
if a > b {
return a
}
return b
}
times.sweepterm = gctimer.cycle.scan - gctimer.cycle.sweepterm
gctimer.total.sweepterm += times.sweepterm
gctimer.max.sweepterm = max(gctimer.max.sweepterm, times.sweepterm)
gctimer.maxpause = max(gctimer.maxpause, gctimer.max.sweepterm)
times.scan = gctimer.cycle.installmarkwb - gctimer.cycle.scan
gctimer.total.scan += times.scan
gctimer.max.scan = max(gctimer.max.scan, times.scan)
times.installmarkwb = gctimer.cycle.mark - gctimer.cycle.installmarkwb
gctimer.total.installmarkwb += times.installmarkwb
gctimer.max.installmarkwb = max(gctimer.max.installmarkwb, times.installmarkwb)
gctimer.maxpause = max(gctimer.maxpause, gctimer.max.installmarkwb)
times.mark = gctimer.cycle.markterm - gctimer.cycle.mark
gctimer.total.mark += times.mark
gctimer.max.mark = max(gctimer.max.mark, times.mark)
times.markterm = gctimer.cycle.sweep - gctimer.cycle.markterm
gctimer.total.markterm += times.markterm
gctimer.max.markterm = max(gctimer.max.markterm, times.markterm)
gctimer.maxpause = max(gctimer.maxpause, gctimer.max.markterm)
return times
}
// GCprinttimes prints latency information in nanoseconds about various
// phases in the GC. The information for each phase includes the maximum pause
// and total time since the most recent call to GCstarttimes as well as
// the information from the most recent concurrent GC cycle. Calls from the
// application to runtime.GC() are ignored.
func GCprinttimes() {
if gctimer.verbose == 0 {
println("GC timers not enabled")
return
}
// Explicitly put times on the heap so printPhase can use it.
times := new(gctimes)
*times = calctimes()
cycletime := gctimer.cycle.sweep - gctimer.cycle.sweepterm
pause := times.sweepterm + times.installmarkwb + times.markterm
gomaxprocs := GOMAXPROCS(-1)
printlock()
print("GC: #", gctimer.count, " ", cycletime, "ns @", gctimer.cycle.sweepterm, " pause=", pause, " maxpause=", gctimer.maxpause, " goroutines=", allglen, " gomaxprocs=", gomaxprocs, "\n")
printPhase := func(label string, get func(*gctimes) int64, procs int) {
print("GC: ", label, " ", get(times), "ns\tmax=", get(&gctimer.max), "\ttotal=", get(&gctimer.total), "\tprocs=", procs, "\n")
}
printPhase("sweep term:", func(t *gctimes) int64 { return t.sweepterm }, gomaxprocs)
printPhase("scan: ", func(t *gctimes) int64 { return t.scan }, 1)
printPhase("install wb:", func(t *gctimes) int64 { return t.installmarkwb }, gomaxprocs)
printPhase("mark: ", func(t *gctimes) int64 { return t.mark }, 1)
printPhase("mark term: ", func(t *gctimes) int64 { return t.markterm }, gomaxprocs)
printunlock()
}
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime
import _ "unsafe" // for go:linkname
//go:linkname runtime_debug_freeOSMemory runtime/debug.freeOSMemory
func runtime_debug_freeOSMemory() {
gogc(2) // force GC and do eager sweep
systemstack(scavenge_m)
}
var poolcleanup func()
//go:linkname sync_runtime_registerPoolCleanup sync.runtime_registerPoolCleanup
func sync_runtime_registerPoolCleanup(f func()) {
poolcleanup = f
}
func clearpools() {
// clear sync.Pools
if poolcleanup != nil {
poolcleanup()
}
for _, p := range &allp {
if p == nil {
break
}
// clear tinyalloc pool
if c := p.mcache; c != nil {
c.tiny = nil
c.tinyoffset = 0
// disconnect cached list before dropping it on the floor,
// so that a dangling ref to one entry does not pin all of them.
var sg, sgnext *sudog
for sg = c.sudogcache; sg != nil; sg = sgnext {
sgnext = sg.next
sg.next = nil
}
c.sudogcache = nil
}
// clear defer pools
for i := range p.deferpool {
// disconnect cached list before dropping it on the floor,
// so that a dangling ref to one entry does not pin all of them.
var d, dlink *_defer
for d = p.deferpool[i]; d != nil; d = dlink {
dlink = d.link
d.link = nil
}
p.deferpool[i] = nil
}
}
}
// backgroundgc is running in a goroutine and does the concurrent GC work.
// bggc holds the state of the backgroundgc.
func backgroundgc() {
bggc.g = getg()
for {
gcwork(0)
lock(&bggc.lock)
bggc.working = 0
goparkunlock(&bggc.lock, "Concurrent GC wait", traceEvGoBlock)
}
}
func bgsweep() {
sweep.g = getg()
for {
for gosweepone() != ^uintptr(0) {
sweep.nbgsweep++
Gosched()
}
lock(&gclock)
if !gosweepdone() {
// This can happen if a GC runs between
// gosweepone returning ^0 above
// and the lock being acquired.
unlock(&gclock)
continue
}
sweep.parked = true
goparkunlock(&gclock, "GC sweep wait", traceEvGoBlock)
}
}
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Garbage collector: marking and scanning
package runtime
import "unsafe"
// Scan all of the stacks, greying (or graying if in America) the referents
// but not blackening them since the mark write barrier isn't installed.
//go:nowritebarrier
func gcscan_m() {
_g_ := getg()
// Grab the g that called us and potentially allow rescheduling.
// This allows it to be scanned like other goroutines.
mastergp := _g_.m.curg
casgstatus(mastergp, _Grunning, _Gwaiting)
mastergp.waitreason = "garbage collection scan"
// Span sweeping has been done by finishsweep_m.
// Long term we will want to make this goroutine runnable
// by placing it onto a scanenqueue state and then calling
// runtime·restartg(mastergp) to make it Grunnable.
// At the bottom we will want to return this p back to the scheduler.
// Prepare flag indicating that the scan has not been completed.
lock(&allglock)
local_allglen := allglen
for i := uintptr(0); i < local_allglen; i++ {
gp := allgs[i]
gp.gcworkdone = false // set to true in gcphasework
gp.gcscanvalid = false // stack has not been scanned
}
unlock(&allglock)
work.nwait = 0
work.ndone = 0
work.nproc = 1 // For now do not do this in parallel.
// ackgcphase is not needed since we are not scanning running goroutines.
parforsetup(work.markfor, work.nproc, uint32(_RootCount+local_allglen), false, markroot)
parfordo(work.markfor)
lock(&allglock)
// Check that gc work is done.
for i := uintptr(0); i < local_allglen; i++ {
gp := allgs[i]
if !gp.gcworkdone {
throw("scan missed a g")
}
}
unlock(&allglock)
casgstatus(mastergp, _Gwaiting, _Grunning)
// Let the g that called us continue to run.
}
// ptrmask for an allocation containing a single pointer.
var oneptr = [...]uint8{typePointer}
//go:nowritebarrier
func markroot(desc *parfor, i uint32) {
var gcw gcWorkProducer
gcw.initFromCache()
// Note: if you add a case here, please also update heapdump.c:dumproots.
switch i {
case _RootData:
scanblock(uintptr(unsafe.Pointer(&data)), uintptr(unsafe.Pointer(&edata))-uintptr(unsafe.Pointer(&data)), gcdatamask.bytedata, &gcw)
case _RootBss:
scanblock(uintptr(unsafe.Pointer(&bss)), uintptr(unsafe.Pointer(&ebss))-uintptr(unsafe.Pointer(&bss)), gcbssmask.bytedata, &gcw)
case _RootFinalizers:
for fb := allfin; fb != nil; fb = fb.alllink {
scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], &gcw)
}
case _RootSpans:
// mark MSpan.specials
sg := mheap_.sweepgen
for spanidx := uint32(0); spanidx < uint32(len(work.spans)); spanidx++ {
s := work.spans[spanidx]
if s.state != mSpanInUse {
continue
}
if !checkmarkphase && s.sweepgen != sg {
// sweepgen was updated (+2) during non-checkmark GC pass
print("sweep ", s.sweepgen, " ", sg, "\n")
throw("gc: unswept span")
}
for sp := s.specials; sp != nil; sp = sp.next {
if sp.kind != _KindSpecialFinalizer {
continue
}
// don't mark finalized object, but scan it so we
// retain everything it points to.
spf := (*specialfinalizer)(unsafe.Pointer(sp))
// A finalizer can be set for an inner byte of an object, find object beginning.
p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize
if gcphase != _GCscan {
scanblock(p, s.elemsize, nil, &gcw) // scanned during mark phase
}
scanblock(uintptr(unsafe.Pointer(&spf.fn)), ptrSize, &oneptr[0], &gcw)
}
}
case _RootFlushCaches:
if gcphase != _GCscan { // Do not flush mcaches during GCscan phase.
flushallmcaches()
}
default:
// the rest is scanning goroutine stacks
if uintptr(i-_RootCount) >= allglen {
throw("markroot: bad index")
}
gp := allgs[i-_RootCount]
// remember when we've first observed the G blocked
// needed only to output in traceback
status := readgstatus(gp) // We are not in a scan state
if (status == _Gwaiting || status == _Gsyscall) && gp.waitsince == 0 {
gp.waitsince = work.tstart
}
// Shrink a stack if not much of it is being used but not in the scan phase.
if gcphase == _GCmarktermination {
// Shrink during STW GCmarktermination phase thus avoiding
// complications introduced by shrinking during
// non-STW phases.
shrinkstack(gp)
}
if readgstatus(gp) == _Gdead {
gp.gcworkdone = true
} else {
gp.gcworkdone = false
}
restart := stopg(gp)
// goroutine will scan its own stack when it stops running.
// Wait until it has.
for readgstatus(gp) == _Grunning && !gp.gcworkdone {
}
// scanstack(gp) is done as part of gcphasework
// But to make sure we finished we need to make sure that
// the stack traps have all responded so drop into
// this while loop until they respond.
for !gp.gcworkdone {
status = readgstatus(gp)
if status == _Gdead {
gp.gcworkdone = true // scan is a noop
break
}
if status == _Gwaiting || status == _Grunnable {
restart = stopg(gp)
}
}
if restart {
restartg(gp)
}
}
gcw.dispose()
}
// gchelpwork does a small bounded amount of gc work. The purpose is to
// shorten the time (as measured by allocations) spent doing a concurrent GC.
// The number of mutator calls is roughly proportional to the number of allocations
// made by that mutator. This slows down the allocation while speeding up the GC.
//go:nowritebarrier
func gchelpwork() {
switch gcphase {
default:
throw("gcphasework in bad gcphase")
case _GCoff, _GCquiesce, _GCstw:
// No work.
case _GCsweep:
// We could help by calling sweepone to sweep a single span.
// _ = sweepone()
case _GCscan:
// scan the stack, mark the objects, put pointers in work buffers
// hanging off the P where this is being run.
// scanstack(gp)
case _GCmark:
// Get a full work buffer and empty it.
// drain your own currentwbuf first in the hopes that it will
// be more cache friendly.
var gcw gcWork
gcw.initFromCache()
const n = len(workbuf{}.obj)
gcDrainN(&gcw, n) // drain up to one buffer's worth of objects
gcw.dispose()
case _GCmarktermination:
// We should never be here since the world is stopped.
// All available mark work will be emptied before returning.
throw("gcphasework in bad gcphase")
}
}
// The gp has been moved to a GC safepoint. GC phase specific
// work is done here.
//go:nowritebarrier
func gcphasework(gp *g) {
switch gcphase {
default:
throw("gcphasework in bad gcphase")
case _GCoff, _GCquiesce, _GCstw, _GCsweep:
// No work.
case _GCscan:
// scan the stack, mark the objects, put pointers in work buffers
// hanging off the P where this is being run.
// Indicate that the scan is valid until the goroutine runs again
scanstack(gp)
case _GCmark:
// No work.
case _GCmarktermination:
scanstack(gp)
// All available mark work will be emptied before returning.
}
gp.gcworkdone = true
}
//go:nowritebarrier
func scanstack(gp *g) {
if gp.gcscanvalid {
return
}
if readgstatus(gp)&_Gscan == 0 {
print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n")
throw("scanstack - bad status")
}
switch readgstatus(gp) &^ _Gscan {
default:
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
throw("mark - bad status")
case _Gdead:
return
case _Grunning:
print("runtime: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
throw("scanstack: goroutine not stopped")
case _Grunnable, _Gsyscall, _Gwaiting:
// ok
}
if gp == getg() {
throw("can't scan our own stack")
}
mp := gp.m
if mp != nil && mp.helpgc != 0 {
throw("can't scan gchelper stack")
}
var gcw gcWorkProducer
gcw.initFromCache()
scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
// Pick up gcw as free variable so gentraceback and friends can
// keep the same signature.
scanframeworker(frame, unused, &gcw)
return true
}
gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, scanframe, nil, 0)
tracebackdefers(gp, scanframe, nil)
gcw.disposeToCache()
gp.gcscanvalid = true
}
// Scan a stack frame: local variables and function arguments/results.
//go:nowritebarrier
func scanframeworker(frame *stkframe, unused unsafe.Pointer, gcw *gcWorkProducer) {
f := frame.fn
targetpc := frame.continpc
if targetpc == 0 {
// Frame is dead.
return
}
if _DebugGC > 1 {
print("scanframe ", funcname(f), "\n")
}
if targetpc != f.entry {
targetpc--
}
pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
if pcdata == -1 {
// We do not have a valid pcdata value but there might be a
// stackmap for this function. It is likely that we are looking
// at the function prologue, assume so and hope for the best.
pcdata = 0
}
// Scan local variables if stack frame has been allocated.
size := frame.varp - frame.sp
var minsize uintptr
if thechar != '6' && thechar != '8' {
minsize = ptrSize
} else {
minsize = 0
}
if size > minsize {
stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
if stkmap == nil || stkmap.n <= 0 {
print("runtime: frame ", funcname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n")
throw("missing stackmap")
}
// Locals bitmap information, scan just the pointers in locals.
if pcdata < 0 || pcdata >= stkmap.n {
// don't know where we are
print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " locals stack map entries for ", funcname(f), " (targetpc=", targetpc, ")\n")
throw("scanframe: bad symbol table")
}
bv := stackmapdata(stkmap, pcdata)
size = (uintptr(bv.n) / typeBitsWidth) * ptrSize
scanblock(frame.varp-size, size, bv.bytedata, gcw)
}
// Scan arguments.
if frame.arglen > 0 {
var bv bitvector
if frame.argmap != nil {
bv = *frame.argmap
} else {
stkmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
if stkmap == nil || stkmap.n <= 0 {
print("runtime: frame ", funcname(f), " untyped args ", hex(frame.argp), "+", hex(frame.arglen), "\n")
throw("missing stackmap")
}
if pcdata < 0 || pcdata >= stkmap.n {
// don't know where we are
print("runtime: pcdata is ", pcdata, " and ", stkmap.n, " args stack map entries for ", funcname(f), " (targetpc=", targetpc, ")\n")
throw("scanframe: bad symbol table")
}
bv = stackmapdata(stkmap, pcdata)
}
scanblock(frame.argp, uintptr(bv.n)/typeBitsWidth*ptrSize, bv.bytedata, gcw)
}
}
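// Illustrative sketch (not part of the original change): the locals size
// computed from the stack map in scanframeworker above. With two type bits
// per word and 8-byte words (assumed below only for concreteness), a
// bitvector of n == 24 bits describes 24/2 = 12 words, so 96 bytes ending at
// frame.varp are scanned. The helper name is hypothetical.
func localsBytesExample(nbits uintptr) uintptr {
	const exTypeBitsWidth, exPtrSize = 2, 8 // assumed values for this sketch
	return nbits / exTypeBitsWidth * exPtrSize
}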
// gcDrain scans objects in work buffers (starting with wbuf), blackening grey
// objects until all work buffers have been drained.
//go:nowritebarrier
func gcDrain(gcw *gcWork) {
if gcphase != _GCmark && gcphase != _GCmarktermination {
throw("scanblock phase incorrect")
}
for {
// If another proc wants a pointer, give it some.
if work.nwait > 0 && work.full == 0 {
gcw.balance()
}
b := gcw.get()
if b == 0 {
// work barrier reached
break
}
// If the current wbuf is filled by the scan a new wbuf might be
// returned that could possibly hold only a single object. This
// could result in each iteration draining only a single object
// out of the wbuf passed in + a single object placed
// into an empty wbuf in scanobject so there could be
// a performance hit as we keep fetching fresh wbufs.
scanobject(b, 0, nil, &gcw.gcWorkProducer)
}
checknocurrentwbuf()
}
// gcDrainN scans n objects, blackening grey objects.
//go:nowritebarrier
func gcDrainN(gcw *gcWork, n int) {
checknocurrentwbuf()
for i := 0; i < n; i++ {
// This might be a good place to add prefetch code...
// if(wbuf.nobj > 4) {
// PREFETCH(wbuf->obj[wbuf.nobj - 3]);
// }
b := gcw.tryGet()
if b == 0 {
return
}
scanobject(b, 0, nil, &gcw.gcWorkProducer)
}
}
// scanblock scans b as scanobject would.
// If the gcphase is GCscan, scanblock performs additional checks.
//go:nowritebarrier
func scanblock(b0, n0 uintptr, ptrmask *uint8, gcw *gcWorkProducer) {
// Use local copies of original parameters, so that a stack trace
// due to one of the throws below shows the original block
// base and extent.
b := b0
n := n0
// ptrmask can have 2 possible values:
// 1. nil - obtain pointer mask from GC bitmap.
// 2. pointer to a compact mask (for stacks and data).
scanobject(b, n, ptrmask, gcw)
if gcphase == _GCscan {
if inheap(b) && ptrmask == nil {
// b is in heap, we are in GCscan so there should be a ptrmask.
throw("scanblock: In GCscan phase and inheap is true.")
}
}
}
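// Illustrative sketch (not part of the original change): how a compact
// ptrmask byte is decoded in scanobject below. Each byte packs four 2-bit
// type entries, so word w of the block is described by byte w/4 at bit
// offset (w%4)*2. For a mask byte of 0xe4 (binary 11 10 01 00), words 0..3
// decode to 0, 1, 2 and 3. The helper name is hypothetical.
func ptrmaskEntryExample(mask []byte, word uintptr) uintptr {
	return (uintptr(mask[word/4]) >> ((word % 4) * 2)) & 3
}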
// Scan the object b of size n, adding pointers to wbuf.
// Return possibly new wbuf to use.
// If ptrmask != nil, it specifies where pointers are in b.
// If ptrmask == nil, the GC bitmap should be consulted.
// In this case, n may be an overestimate of the size; the GC bitmap
// must also be used to make sure the scan stops at the end of b.
//go:nowritebarrier
func scanobject(b, n uintptr, ptrmask *uint8, gcw *gcWorkProducer) {
arena_start := mheap_.arena_start
arena_used := mheap_.arena_used
// Find bits of the beginning of the object.
var hbits heapBits
if ptrmask == nil {
b, hbits = heapBitsForObject(b)
if b == 0 {
return
}
if n == 0 {
n = mheap_.arena_used - b
}
}
for i := uintptr(0); i < n; i += ptrSize {
// Find bits for this word.
var bits uintptr
if ptrmask != nil {
// dense mask (stack or data)
bits = (uintptr(*(*byte)(add(unsafe.Pointer(ptrmask), (i/ptrSize)/4))) >> (((i / ptrSize) % 4) * typeBitsWidth)) & typeMask
} else {
// Check if we have reached end of span.
// n is an overestimate of the size of the object.
if (b+i)%_PageSize == 0 && h_spans[(b-arena_start)>>_PageShift] != h_spans[(b+i-arena_start)>>_PageShift] {
break
}
bits = uintptr(hbits.typeBits())
if i > 0 && (hbits.isBoundary() || bits == typeDead) {
break // reached beginning of the next object
}
hbits = hbits.next()
}
if bits <= typeScalar { // typeScalar, typeDead, typeScalarMarked
continue
}
if bits&typePointer != typePointer {
print("gc checkmarkphase=", checkmarkphase, " b=", hex(b), " ptrmask=", ptrmask, "\n")
throw("unexpected garbage collection bits")
}
obj := *(*uintptr)(unsafe.Pointer(b + i))
// At this point we have extracted the next potential pointer.
// Check if it points into heap.
if obj == 0 || obj < arena_start || obj >= arena_used {
continue
}
if mheap_.shadow_enabled && debug.wbshadow >= 2 && debug.gccheckmark > 0 && checkmarkphase {
checkwbshadow((*uintptr)(unsafe.Pointer(b + i)))
}
// Mark the object.
if obj, hbits := heapBitsForObject(obj); obj != 0 {
greyobject(obj, b, i, hbits, gcw)
}
}
}
// Shade the object if it isn't already.
// The object is not nil and known to be in the heap.
//go:nowritebarrier
func shade(b uintptr) {
if !inheap(b) {
throw("shade: passed an address not in the heap")
}
if obj, hbits := heapBitsForObject(b); obj != 0 {
// TODO: this would be a great place to put a check to see
// if we are harvesting and if we are then we should
// figure out why there is a call to shade when the
// harvester thinks we are in a STW.
// if atomicload(&harvestingwbufs) == uint32(1) {
// // Throw here to discover write barriers
// // being executed during a STW.
// throw("shade during harvest")
// }
var gcw gcWorkProducer
greyobject(obj, 0, 0, hbits, &gcw)
// This is part of the write barrier so put the wbuf back.
if gcphase == _GCmarktermination {
gcw.dispose()
} else {
// If we added any pointers to the gcw, then
// currentwbuf must be nil because 1)
// greyobject got its wbuf from currentwbuf
// and 2) shade runs on the systemstack, so
// we're still on the same M. If either of
// these becomes no longer true, we need to
// rethink this.
gcw.disposeToCache()
}
}
}
// obj is the start of an object with mark mbits.
// If it isn't already marked, mark it and enqueue into workbuf.
// Return possibly new workbuf to use.
// base and off are for debugging only and could be removed.
//go:nowritebarrier
func greyobject(obj, base, off uintptr, hbits heapBits, gcw *gcWorkProducer) {
// obj should be start of allocation, and so must be at least pointer-aligned.
if obj&(ptrSize-1) != 0 {
throw("greyobject: obj not pointer-aligned")
}
if checkmarkphase {
if !hbits.isMarked() {
print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), "\n")
print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
// Dump the source (base) object
kb := base >> _PageShift
xb := kb
xb -= mheap_.arena_start >> _PageShift
sb := h_spans[xb]
printlock()
print("runtime:greyobject Span: base=", hex(base), " kb=", hex(kb))
if sb == nil {
print(" sb=nil\n")
} else {
print(" sb.start*_PageSize=", hex(sb.start*_PageSize), " sb.limit=", hex(sb.limit), " sb.sizeclass=", sb.sizeclass, " sb.elemsize=", sb.elemsize, "\n")
// base is (a pointer to) the source object holding the reference to object. Create a pointer to each
// of the fields in base and print them out as hex values.
for i := 0; i < int(sb.elemsize/ptrSize); i++ {
print(" *(base+", i*ptrSize, ") = ", hex(*(*uintptr)(unsafe.Pointer(base + uintptr(i)*ptrSize))), "\n")
}
}
// Dump the object
k := obj >> _PageShift
x := k
x -= mheap_.arena_start >> _PageShift
s := h_spans[x]
print("runtime:greyobject Span: obj=", hex(obj), " k=", hex(k))
if s == nil {
print(" s=nil\n")
} else {
print(" s.start=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n")
// NOTE(rsc): This code is using s.sizeclass as an approximation of the
// number of pointer-sized words in an object. Perhaps not what was intended.
for i := 0; i < int(s.sizeclass); i++ {
print(" *(obj+", i*ptrSize, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + uintptr(i)*ptrSize))), "\n")
}
}
throw("checkmark found unmarked object")
}
if !hbits.isCheckmarked() {
return
}
hbits.setCheckmarked()
if !hbits.isCheckmarked() {
throw("setCheckmarked and isCheckmarked disagree")
}
} else {
// If marked we have nothing to do.
if hbits.isMarked() {
return
}
// Each byte of GC bitmap holds info for two words.
// Might be racing with other updates, so use atomic update always.
// We used to be clever here and use a non-atomic update in certain
// cases, but it's not worth the risk.
hbits.setMarked()
}
if !checkmarkphase && hbits.typeBits() == typeDead {
return // noscan object
}
// Queue the obj for scanning. The PREFETCH(obj) logic has been removed but
// seems like a nice optimization that can be added back in.
// There needs to be time between the PREFETCH and the use.
// Previously we put the obj in an 8 element buffer that is drained at a rate
// to give the PREFETCH time to do its work.
// Use of PREFETCHNTA might be more appropriate than PREFETCH
gcw.put(obj)
}
// When in GCmarkterminate phase we allocate black.
//go:nowritebarrier
func gcmarknewobject_m(obj uintptr) {
if gcphase != _GCmarktermination {
throw("marking new object while not in mark termination phase")
}
if checkmarkphase { // The world should be stopped so this should not happen.
throw("gcmarknewobject called while doing checkmark")
}
heapBitsForAddr(obj).setMarked()
}
// Checkmarking
// To help debug the concurrent GC we remark with the world
// stopped ensuring that any object encountered has their normal
// mark bit set. To do this we use an orthogonal bit
// pattern to indicate the object is marked. The following pattern
// uses the upper two bits in the object's boundary nibble.
// 01: scalar not marked
// 10: pointer not marked
// 11: pointer marked
// 00: scalar marked
// XORing with 01 will flip the pattern from marked to unmarked and vice versa.
// The higher bit is 1 for pointers and 0 for scalars, whether the object
// is marked or not.
// The first nibble no longer holds the typeDead pattern indicating that
// there are no more pointers in the object. This information is held
// in the second nibble.
// When marking an object if the bool checkmarkphase is true one uses the above
// encoding, otherwise one uses the bitMarked bit in the lower two bits
// of the nibble.
var checkmarkphase = false
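// Illustrative sketch (not part of the original change): the xor-with-01 flip
// described in the comment above, spelled out with hypothetical constants for
// the four two-bit patterns. Flipping bit 0 toggles marked/unmarked while the
// pointer bit (bit 1) is left untouched.
func checkmarkFlipExample() bool {
	const (
		ckScalarNotMarked  = 0x1 // 01
		ckPointerNotMarked = 0x2 // 10
		ckPointerMarked    = 0x3 // 11
		ckScalarMarked     = 0x0 // 00
	)
	return ckScalarNotMarked^0x1 == ckScalarMarked &&
		ckPointerNotMarked^0x1 == ckPointerMarked
}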
//go:nowritebarrier
func initCheckmarks() {
for _, s := range work.spans {
if s.state == _MSpanInUse {
heapBitsForSpan(s.base()).initCheckmarkSpan(s.layout())
}
}
}
func clearCheckmarks() {
for _, s := range work.spans {
if s.state == _MSpanInUse {
heapBitsForSpan(s.base()).clearCheckmarkSpan(s.layout())
}
}
}
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Garbage collector: sweeping
package runtime
import "unsafe"
var sweep sweepdata
// State of background sweep.
// Protected by gclock.
type sweepdata struct {
g *g
parked bool
started bool
spanidx uint32 // background sweeper position
nbgsweep uint32
npausesweep uint32
}
var gclock mutex
//go:nowritebarrier
func finishsweep_m() {
// The world is stopped so we should be able to complete the sweeps
// quickly.
for sweepone() != ^uintptr(0) {
sweep.npausesweep++
}
// There may be some other spans being swept concurrently that
// we need to wait for. If finishsweep_m is done with the world stopped
// this code is not required.
sg := mheap_.sweepgen
for _, s := range work.spans {
if s.sweepgen != sg && s.state == _MSpanInUse {
mSpan_EnsureSwept(s)
}
}
}
func bgsweep() {
sweep.g = getg()
for {
for gosweepone() != ^uintptr(0) {
sweep.nbgsweep++
Gosched()
}
lock(&gclock)
if !gosweepdone() {
// This can happen if a GC runs between
// gosweepone returning ^0 above
// and the lock being acquired.
unlock(&gclock)
continue
}
sweep.parked = true
goparkunlock(&gclock, "GC sweep wait", traceEvGoBlock)
}
}
// sweeps one span
// returns number of pages returned to heap, or ^uintptr(0) if there is nothing to sweep
//go:nowritebarrier
func sweepone() uintptr {
_g_ := getg()
// increment locks to ensure that the goroutine is not preempted
// in the middle of sweep thus leaving the span in an inconsistent state for next GC
_g_.m.locks++
sg := mheap_.sweepgen
for {
idx := xadd(&sweep.spanidx, 1) - 1
if idx >= uint32(len(work.spans)) {
mheap_.sweepdone = 1
_g_.m.locks--
return ^uintptr(0)
}
s := work.spans[idx]
if s.state != mSpanInUse {
s.sweepgen = sg
continue
}
if s.sweepgen != sg-2 || !cas(&s.sweepgen, sg-2, sg-1) {
continue
}
npages := s.npages
if !mSpan_Sweep(s, false) {
npages = 0
}
_g_.m.locks--
return npages
}
}
//go:nowritebarrier
func gosweepone() uintptr {
var ret uintptr
systemstack(func() {
ret = sweepone()
})
return ret
}
//go:nowritebarrier
func gosweepdone() bool {
return mheap_.sweepdone != 0
}
// Returns only when span s has been swept.
//go:nowritebarrier
func mSpan_EnsureSwept(s *mspan) {
// Caller must disable preemption.
// Otherwise when this function returns the span can become unswept again
// (if GC is triggered on another goroutine).
_g_ := getg()
if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
throw("MSpan_EnsureSwept: m is not locked")
}
sg := mheap_.sweepgen
if atomicload(&s.sweepgen) == sg {
return
}
// The caller must be sure that the span is a MSpanInUse span.
if cas(&s.sweepgen, sg-2, sg-1) {
mSpan_Sweep(s, false)
return
}
// unfortunate condition, and we don't have efficient means to wait
for atomicload(&s.sweepgen) != sg {
osyield()
}
}
// Sweep frees or collects finalizers for blocks not marked in the mark phase.
// It clears the mark bits in preparation for the next GC round.
// Returns true if the span was returned to heap.
// If preserve=true, don't return it to heap nor relink in MCentral lists;
// caller takes care of it.
//TODO go:nowritebarrier
func mSpan_Sweep(s *mspan, preserve bool) bool {
if checkmarkphase {
throw("MSpan_Sweep: checkmark only runs in STW and after the sweep")
}
// It's critical that we enter this function with preemption disabled,
// GC must not start while we are in the middle of this function.
_g_ := getg()
if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
throw("MSpan_Sweep: m is not locked")
}
sweepgen := mheap_.sweepgen
if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
throw("MSpan_Sweep: bad span state")
}
if trace.enabled {
traceGCSweepStart()
}
cl := s.sizeclass
size := s.elemsize
res := false
nfree := 0
var head, end gclinkptr
c := _g_.m.mcache
sweepgenset := false
// Mark any free objects in this span so we don't collect them.
for link := s.freelist; link.ptr() != nil; link = link.ptr().next {
heapBitsForAddr(uintptr(link)).setMarkedNonAtomic()
}
// Unlink & free special records for any objects we're about to free.
specialp := &s.specials
special := *specialp
for special != nil {
// A finalizer can be set for an inner byte of an object, find object beginning.
p := uintptr(s.start<<_PageShift) + uintptr(special.offset)/size*size
hbits := heapBitsForAddr(p)
if !hbits.isMarked() {
// Find the exact byte for which the special was setup
// (as opposed to object beginning).
p := uintptr(s.start<<_PageShift) + uintptr(special.offset)
// about to free object: splice out special record
y := special
special = special.next
*specialp = special
if !freespecial(y, unsafe.Pointer(p), size, false) {
// stop freeing of object if it has a finalizer
hbits.setMarkedNonAtomic()
}
} else {
// object is still live: keep special record
specialp = &special.next
special = *specialp
}
}
// Sweep through n objects of given size starting at p.
// This thread owns the span now, so it can manipulate
// the block bitmap without atomic operations.
size, n, _ := s.layout()
heapBitsSweepSpan(s.base(), size, n, func(p uintptr) {
// At this point we know that we are looking at garbage object
// that needs to be collected.
if debug.allocfreetrace != 0 {
tracefree(unsafe.Pointer(p), size)
}
// Reset to allocated+noscan.
if cl == 0 {
// Free large span.
if preserve {
throw("can't preserve large span")
}
heapBitsForSpan(p).clearSpan(s.layout())
s.needzero = 1
// important to set sweepgen before returning it to heap
atomicstore(&s.sweepgen, sweepgen)
sweepgenset = true
// NOTE(rsc,dvyukov): The original implementation of efence
// in CL 22060046 used SysFree instead of SysFault, so that
// the operating system would eventually give the memory
// back to us again, so that an efence program could run
// longer without running out of memory. Unfortunately,
// calling SysFree here without any kind of adjustment of the
// heap data structures means that when the memory does
// come back to us, we have the wrong metadata for it, either in
// the MSpan structures or in the garbage collection bitmap.
// Using SysFault here means that the program will run out of
// memory fairly quickly in efence mode, but at least it won't
// have mysterious crashes due to confused memory reuse.
// It should be possible to switch back to SysFree if we also
// implement and then call some kind of MHeap_DeleteSpan.
if debug.efence > 0 {
s.limit = 0 // prevent mlookup from finding this span
sysFault(unsafe.Pointer(p), size)
} else {
mHeap_Free(&mheap_, s, 1)
}
c.local_nlargefree++
c.local_largefree += size
reduction := int64(size) * int64(gcpercent+100) / 100
if int64(memstats.next_gc)-reduction > int64(heapminimum) {
xadd64(&memstats.next_gc, -reduction)
} else {
atomicstore64(&memstats.next_gc, heapminimum)
}
res = true
} else {
// Free small object.
if size > 2*ptrSize {
*(*uintptr)(unsafe.Pointer(p + ptrSize)) = uintptrMask & 0xdeaddeaddeaddead // mark as "needs to be zeroed"
} else if size > ptrSize {
*(*uintptr)(unsafe.Pointer(p + ptrSize)) = 0
}
if head.ptr() == nil {
head = gclinkptr(p)
} else {
end.ptr().next = gclinkptr(p)
}
end = gclinkptr(p)
end.ptr().next = gclinkptr(0x0bade5)
nfree++
}
})
// We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
// because of the potential for a concurrent free/SetFinalizer.
// But we need to set it before we make the span available for allocation
// (return it to heap or mcentral), because allocation code assumes that a
// span is already swept if available for allocation.
if !sweepgenset && nfree == 0 {
// The span must be in our exclusive ownership until we update sweepgen,
// check for potential races.
if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
throw("MSpan_Sweep: bad span state after sweep")
}
atomicstore(&s.sweepgen, sweepgen)
}
if nfree > 0 {
c.local_nsmallfree[cl] += uintptr(nfree)
c.local_cachealloc -= intptr(uintptr(nfree) * size)
reduction := int64(nfree) * int64(size) * int64(gcpercent+100) / 100
if int64(memstats.next_gc)-reduction > int64(heapminimum) {
xadd64(&memstats.next_gc, -reduction)
} else {
atomicstore64(&memstats.next_gc, heapminimum)
}
res = mCentral_FreeSpan(&mheap_.central[cl].mcentral, s, int32(nfree), head, end, preserve)
// MCentral_FreeSpan updates sweepgen
}
if trace.enabled {
traceGCSweepDone()
traceNextGC()
}
return res
}
......@@ -4,7 +4,72 @@
// Page heap.
//
// See malloc.go for overview.
package runtime
import "unsafe"
// Main malloc heap.
// The heap itself is the "free[]" and "large" arrays,
// but all the other global data is here too.
type mheap struct {
lock mutex
free [_MaxMHeapList]mspan // free lists of given length
freelarge mspan // free lists length >= _MaxMHeapList
busy [_MaxMHeapList]mspan // busy lists of large objects of given length
busylarge mspan // busy lists of large objects length >= _MaxMHeapList
allspans **mspan // all spans out there
gcspans **mspan // copy of allspans referenced by gc marker or sweeper
nspan uint32
sweepgen uint32 // sweep generation, see comment in mspan
sweepdone uint32 // all spans are swept
// span lookup
spans **mspan
spans_mapped uintptr
// range of addresses we might see in the heap
bitmap uintptr
bitmap_mapped uintptr
arena_start uintptr
arena_used uintptr
arena_end uintptr
arena_reserved bool
// write barrier shadow data+heap.
// 64-bit systems only, enabled by GODEBUG=wbshadow=1.
shadow_enabled bool // shadow should be updated and checked
shadow_reserved bool // shadow memory is reserved
shadow_heap uintptr // heap-addr + shadow_heap = shadow heap addr
shadow_data uintptr // data-addr + shadow_data = shadow data addr
data_start uintptr // start of shadowed data addresses
data_end uintptr // end of shadowed data addresses
// central free lists for small size classes.
// the padding makes sure that the MCentrals are
// spaced CacheLineSize bytes apart, so that each MCentral.lock
// gets its own cache line.
central [_NumSizeClasses]struct {
mcentral mcentral
pad [_CacheLineSize]byte
}
spanalloc fixalloc // allocator for span*
cachealloc fixalloc // allocator for mcache*
specialfinalizeralloc fixalloc // allocator for specialfinalizer*
specialprofilealloc fixalloc // allocator for specialprofile*
speciallock mutex // lock for special record allocators.
// Malloc stats.
largefree uint64 // bytes freed for large objects (>maxsmallsize)
nlargefree uint64 // number of frees for large objects (>maxsmallsize)
nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
}
var mheap_ mheap
// An MSpan is a run of pages.
//
// When a MSpan is in the heap free list, state == MSpanFree
// and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span.
......@@ -12,9 +77,55 @@
// When a MSpan is allocated, state == MSpanInUse or MSpanStack
// and heapmap(i) == span for all s->start <= i < s->start+s->npages.
package runtime
import "unsafe"
// Every MSpan is in one doubly-linked list,
// either one of the MHeap's free lists or one of the
// MCentral's span lists. We use empty MSpan structures as list heads.
const (
_MSpanInUse = iota // allocated for garbage collected heap
_MSpanStack // allocated for use by stack allocator
_MSpanFree
_MSpanListHead
_MSpanDead
)
type mspan struct {
next *mspan // in a span linked list
prev *mspan // in a span linked list
start pageID // starting page number
npages uintptr // number of pages in span
freelist gclinkptr // list of free objects
// sweep generation:
// if sweepgen == h->sweepgen - 2, the span needs sweeping
// if sweepgen == h->sweepgen - 1, the span is currently being swept
// if sweepgen == h->sweepgen, the span is swept and ready to use
// h->sweepgen is incremented by 2 after every GC
sweepgen uint32
ref uint16 // capacity - number of objects in freelist
sizeclass uint8 // size class
incache bool // being used by an mcache
state uint8 // mspaninuse etc
needzero uint8 // needs to be zeroed before allocation
elemsize uintptr // computed from sizeclass or from npages
unusedsince int64 // first time spotted by gc in mspanfree state
npreleased uintptr // number of pages released to the os
limit uintptr // end of data in span
speciallock mutex // guards specials list
specials *special // linked list of special records sorted by offset.
}
func (s *mspan) base() uintptr {
return uintptr(s.start << _PageShift)
}
func (s *mspan) layout() (size, n, total uintptr) {
total = s.npages << _PageShift
size = s.elemsize
if size > 0 {
n = total / size
}
return
}
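// Illustrative sketch (not part of the original change): a worked example of
// layout above. For a span whose pages total 8192 bytes and whose elemsize is
// 48, layout returns size = 48, n = 8192/48 = 170 and total = 8192; the
// remaining 8192 - 170*48 = 32 bytes at the end of the span hold no objects.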
var h_allspans []*mspan // TODO: make this h.allspans once mheap can be defined in Go
var h_spans []*mspan // TODO: make this h.spans once mheap can be defined in Go
......@@ -50,6 +161,73 @@ func recordspan(vh unsafe.Pointer, p unsafe.Pointer) {
h.nspan = uint32(len(h_allspans))
}
// inheap reports whether b is a pointer into a (potentially dead) heap object.
// It returns false for pointers into stack spans.
//go:nowritebarrier
func inheap(b uintptr) bool {
if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used {
return false
}
// Not a beginning of a block, consult span table to find the block beginning.
k := b >> _PageShift
x := k
x -= mheap_.arena_start >> _PageShift
s := h_spans[x]
if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse {
return false
}
return true
}
func mlookup(v uintptr, base *uintptr, size *uintptr, sp **mspan) int32 {
_g_ := getg()
_g_.m.mcache.local_nlookup++
if ptrSize == 4 && _g_.m.mcache.local_nlookup >= 1<<30 {
// purge cache stats to prevent overflow
lock(&mheap_.lock)
purgecachedstats(_g_.m.mcache)
unlock(&mheap_.lock)
}
s := mHeap_LookupMaybe(&mheap_, unsafe.Pointer(v))
if sp != nil {
*sp = s
}
if s == nil {
if base != nil {
*base = 0
}
if size != nil {
*size = 0
}
return 0
}
p := uintptr(s.start) << _PageShift
if s.sizeclass == 0 {
// Large object.
if base != nil {
*base = p
}
if size != nil {
*size = s.npages << _PageShift
}
return 1
}
n := s.elemsize
if base != nil {
i := (uintptr(v) - uintptr(p)) / n
*base = p + i*n
}
if size != nil {
*size = n
}
return 1
}
// Initialize the heap.
func mHeap_Init(h *mheap, spans_size uintptr) {
fixAlloc_Init(&h.spanalloc, unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
......@@ -635,6 +813,21 @@ func mSpanList_InsertBack(list *mspan, span *mspan) {
span.prev.next = span
}
const (
_KindSpecialFinalizer = 1
_KindSpecialProfile = 2
// Note: The finalizer special must be first because if we're freeing
// an object, a finalizer special will cause the freeing operation
// to abort, and we want to keep the other special records around
// if that happens.
)
type special struct {
next *special // linked list in span
offset uint16 // span offset of object
kind byte // kind of special
}
// Adds the special record s to the list of special records for
// the object p. All fields of s should be filled in except for
// offset & next, which this routine will fill in.
......@@ -723,6 +916,15 @@ func removespecial(p unsafe.Pointer, kind uint8) *special {
return nil
}
// The described object has a finalizer set for it.
type specialfinalizer struct {
special special
fn *funcval
nret uintptr
fint *_type
ot *ptrtype
}
// Adds a finalizer to the object p. Returns true if it succeeded.
func addfinalizer(p unsafe.Pointer, f *funcval, nret uintptr, fint *_type, ot *ptrtype) bool {
lock(&mheap_.speciallock)
......@@ -755,6 +957,12 @@ func removefinalizer(p unsafe.Pointer) {
unlock(&mheap_.speciallock)
}
// The described object is being heap profiled.
type specialprofile struct {
special special
b *bucket
}
// Set the heap profile bucket associated with addr to b.
func setprofilebucket(p unsafe.Pointer, b *bucket) {
lock(&mheap_.speciallock)
......
......@@ -27,8 +27,15 @@
package runtime
//var class_to_size [_NumSizeClasses]int32
//var class_to_allocnpages [_NumSizeClasses]int32
// Size classes. Computed and initialized by InitSizes.
//
// SizeToClass(0 <= n <= MaxSmallSize) returns the size class,
// 1 <= sizeclass < NumSizeClasses, for n.
// Size class 0 is reserved to mean "not small".
//
// class_to_size[i] = largest size in class i
// class_to_allocnpages[i] = number of pages to allocate when
// making new objects in class i
// The SizeToClass lookup is implemented using two arrays,
// one mapping sizes <= 1024 to their class and one mapping
......@@ -38,8 +45,11 @@ package runtime
// are 128-aligned, so the second array is indexed by the
// size divided by 128 (rounded up). The arrays are filled in
// by InitSizes.
//var size_to_class8 [1024/8 + 1]int8
//var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8
var class_to_size [_NumSizeClasses]int32
var class_to_allocnpages [_NumSizeClasses]int32
var size_to_class8 [1024/8 + 1]int8
var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8
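// Illustrative sketch (not part of the original change): the two-array lookup
// described in the comment above. Sizes up to 1024 are 8-aligned, so a size n
// maps to index (n+7)/8 in size_to_class8 (n = 20 gives index 3); larger
// small sizes use size_to_class128 at index (n-1024+127)/128. This is a
// sketch of the indexing scheme only, not a copy of sizeToClass below.
func sizeClassIndexExample(n int32) int32 {
	if n <= 1024 {
		return (n + 7) / 8 // index into size_to_class8
	}
	return (n - 1024 + 127) / 128 // index into size_to_class128
}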
func sizeToClass(size int32) int32 {
if size > _MaxSmallSize {
......
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Memory statistics
package runtime
import "unsafe"
// Statistics.
// Shared with Go: if you edit this structure, also edit type MemStats in mem.go.
type mstats struct {
// General statistics.
alloc uint64 // bytes allocated and still in use
total_alloc uint64 // bytes allocated (even if freed)
sys uint64 // bytes obtained from system (should be sum of xxx_sys below, no locking, approximate)
nlookup uint64 // number of pointer lookups
nmalloc uint64 // number of mallocs
nfree uint64 // number of frees
// Statistics about malloc heap.
// protected by mheap.lock
heap_alloc uint64 // bytes allocated and still in use
heap_sys uint64 // bytes obtained from system
heap_idle uint64 // bytes in idle spans
heap_inuse uint64 // bytes in non-idle spans
heap_released uint64 // bytes released to the os
heap_objects uint64 // total number of allocated objects
// Statistics about allocation of low-level fixed-size structures.
// Protected by FixAlloc locks.
stacks_inuse uint64 // this number is included in heap_inuse above
stacks_sys uint64 // always 0 in mstats
mspan_inuse uint64 // mspan structures
mspan_sys uint64
mcache_inuse uint64 // mcache structures
mcache_sys uint64
buckhash_sys uint64 // profiling bucket hash table
gc_sys uint64
other_sys uint64
// Statistics about garbage collector.
// Protected by mheap or stopping the world during GC.
next_gc uint64 // next gc (in heap_alloc time)
last_gc uint64 // last gc (in absolute time)
pause_total_ns uint64
pause_ns [256]uint64 // circular buffer of recent gc pause lengths
pause_end [256]uint64 // circular buffer of recent gc end times (nanoseconds since 1970)
numgc uint32
enablegc bool
debuggc bool
// Statistics about allocation size classes.
by_size [_NumSizeClasses]struct {
size uint32
nmalloc uint64
nfree uint64
}
tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
}
var memstats mstats
// Note: the MemStats struct should be kept in sync with
// struct MStats in malloc.h
......@@ -95,20 +153,188 @@ func ReadMemStats(m *MemStats) {
gp.m.locks--
}
func readmemstats_m(stats *MemStats) {
updatememstats(nil)
// Size of the trailing by_size array differs between Go and C,
// NumSizeClasses was changed, but we can not change Go struct because of backward compatibility.
memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats)
// Stack numbers are part of the heap numbers, separate those out for user consumption
stats.StackSys = stats.StackInuse
stats.HeapInuse -= stats.StackInuse
stats.HeapSys -= stats.StackInuse
}
//go:linkname readGCStats runtime/debug.readGCStats
func readGCStats(pauses *[]uint64) {
systemstack(func() {
readGCStats_m(pauses)
})
}
func readGCStats_m(pauses *[]uint64) {
p := *pauses
// Calling code in runtime/debug should make the slice large enough.
if cap(p) < len(memstats.pause_ns)+3 {
throw("short slice passed to readGCStats")
}
// Pass back: pauses, pause ends, last gc (absolute time), number of gc, total pause ns.
lock(&mheap_.lock)
n := memstats.numgc
if n > uint32(len(memstats.pause_ns)) {
n = uint32(len(memstats.pause_ns))
}
// The pause buffer is circular. The most recent pause is at
// pause_ns[(numgc-1)%len(pause_ns)], and then backward
// from there to go back farther in time. We deliver the times
// most recent first (in p[0]).
p = p[:cap(p)]
for i := uint32(0); i < n; i++ {
j := (memstats.numgc - 1 - i) % uint32(len(memstats.pause_ns))
p[i] = memstats.pause_ns[j]
p[n+i] = memstats.pause_end[j]
}
p[n+n] = memstats.last_gc
p[n+n+1] = uint64(memstats.numgc)
p[n+n+2] = memstats.pause_total_ns
unlock(&mheap_.lock)
*pauses = p[:n+n+3]
}
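// Illustrative sketch (not part of the original change): the circular-buffer
// indexing used in readGCStats_m above. With a 256-entry buffer and
// numgc == 260, the most recent pause sits at (260-1)%256 == 3, the one
// before it at 2, and after index 0 the walk wraps to 255. The helper name
// is hypothetical; it assumes i < numgc so the subtraction cannot wrap.
func recentPauseIndexExample(numgc, i, buflen uint32) uint32 {
	return (numgc - 1 - i) % buflen
}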
//go:nowritebarrier
func updatememstats(stats *gcstats) {
if stats != nil {
*stats = gcstats{}
}
for mp := allm; mp != nil; mp = mp.alllink {
if stats != nil {
src := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(&mp.gcstats))
dst := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(stats))
for i, v := range src {
dst[i] += v
}
mp.gcstats = gcstats{}
}
}
memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse)
memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse)
memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys +
memstats.mcache_sys + memstats.buckhash_sys + memstats.gc_sys + memstats.other_sys
// Calculate memory allocator stats.
// During program execution we only count number of frees and amount of freed memory.
// Current number of alive objects in the heap and amount of alive heap memory
// are calculated by scanning all spans.
// Total number of mallocs is calculated as number of frees plus number of alive objects.
// Similarly, total amount of allocated memory is calculated as amount of freed memory
// plus amount of alive heap memory.
memstats.alloc = 0
memstats.total_alloc = 0
memstats.nmalloc = 0
memstats.nfree = 0
for i := 0; i < len(memstats.by_size); i++ {
memstats.by_size[i].nmalloc = 0
memstats.by_size[i].nfree = 0
}
// Flush MCache's to MCentral.
systemstack(flushallmcaches)
// Aggregate local stats.
cachestats()
// Scan all spans and count number of alive objects.
lock(&mheap_.lock)
for i := uint32(0); i < mheap_.nspan; i++ {
s := h_allspans[i]
if s.state != mSpanInUse {
continue
}
if s.sizeclass == 0 {
memstats.nmalloc++
memstats.alloc += uint64(s.elemsize)
} else {
memstats.nmalloc += uint64(s.ref)
memstats.by_size[s.sizeclass].nmalloc += uint64(s.ref)
memstats.alloc += uint64(s.ref) * uint64(s.elemsize)
}
}
unlock(&mheap_.lock)
// Aggregate by size class.
smallfree := uint64(0)
memstats.nfree = mheap_.nlargefree
for i := 0; i < len(memstats.by_size); i++ {
memstats.nfree += mheap_.nsmallfree[i]
memstats.by_size[i].nfree = mheap_.nsmallfree[i]
memstats.by_size[i].nmalloc += mheap_.nsmallfree[i]
smallfree += uint64(mheap_.nsmallfree[i]) * uint64(class_to_size[i])
}
memstats.nfree += memstats.tinyallocs
memstats.nmalloc += memstats.nfree
// Calculate derived stats.
memstats.total_alloc = uint64(memstats.alloc) + uint64(mheap_.largefree) + smallfree
memstats.heap_alloc = memstats.alloc
memstats.heap_objects = memstats.nmalloc - memstats.nfree
}
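// Illustrative sketch (not part of the original change): the accounting
// identity used in updatememstats above. If the sweeper has recorded 100
// frees totalling 6400 bytes and the span walk finds 40 live objects
// totalling 2560 bytes, then nmalloc = 40 + 100 = 140 and
// total_alloc = 2560 + 6400 = 8960 bytes.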
//go:nowritebarrier
func cachestats() {
for i := 0; ; i++ {
p := allp[i]
if p == nil {
break
}
c := p.mcache
if c == nil {
continue
}
purgecachedstats(c)
}
}
//go:nowritebarrier
func flushallmcaches() {
for i := 0; ; i++ {
p := allp[i]
if p == nil {
break
}
c := p.mcache
if c == nil {
continue
}
mCache_ReleaseAll(c)
stackcache_clear(c)
}
}
//go:nosplit
func purgecachedstats(c *mcache) {
// Protected by either heap or GC lock.
h := &mheap_
memstats.heap_alloc += uint64(c.local_cachealloc)
c.local_cachealloc = 0
if trace.enabled {
traceHeapAlloc()
}
memstats.tinyallocs += uint64(c.local_tinyallocs)
c.local_tinyallocs = 0
memstats.nlookup += uint64(c.local_nlookup)
c.local_nlookup = 0
h.largefree += uint64(c.local_largefree)
c.local_largefree = 0
h.nlargefree += uint64(c.local_nlargefree)
c.local_nlargefree = 0
for i := 0; i < len(c.local_nsmallfree); i++ {
h.nsmallfree[i] += uint64(c.local_nsmallfree[i])
c.local_nsmallfree[i] = 0
}
}
......@@ -528,6 +528,21 @@ func quiesce(mastergp *g) {
mcall(mquiesce)
}
// Holding worldsema grants an M the right to try to stop the world.
// The procedure is:
//
// semacquire(&worldsema);
// m.preemptoff = "reason";
// stoptheworld();
//
// ... do stuff ...
//
// m.preemptoff = "";
// semrelease(&worldsema);
// starttheworld();
//
var worldsema uint32 = 1
// This is used by the GC as well as the routines that do stack dumps. In the case
// of GC all the routines can be reliably stopped. This is not always the case
// when the system is in panic or being exited.
......
......@@ -239,3 +239,13 @@ func prefetcht0(addr uintptr)
func prefetcht1(addr uintptr)
func prefetcht2(addr uintptr)
func prefetchnta(addr uintptr)
func unixnanotime() int64 {
sec, nsec := time_now()
return sec*1e9 + int64(nsec)
}
// round n up to a multiple of a. a must be a power of 2.
func round(n, a uintptr) uintptr {
return (n + a - 1) &^ (a - 1)
}
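// Illustrative sketch (not part of the original change): a few worked cases of
// the bit trick in round above. (13+7)&^7 clears the low bits of 20 and gives
// 16; values already aligned are unchanged. The helper name is hypothetical.
func roundExamples() [3]uintptr {
	return [3]uintptr{round(13, 8), round(16, 8), round(1, 4096)} // 16, 16, 4096
}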
......@@ -299,3 +299,17 @@ func readvarint(p []byte) (newp []byte, val uint32) {
}
return p, v
}
type stackmap struct {
n int32 // number of bitmaps
nbit int32 // number of bits in each bitmap
bytedata [1]byte // bitmaps, each starting on a 32-bit boundary
}
//go:nowritebarrier
func stackmapdata(stkmap *stackmap, n int32) bitvector {
if n < 0 || n >= stkmap.n {
throw("stackmapdata: index out of range")
}
return bitvector{stkmap.nbit, (*byte)(add(unsafe.Pointer(&stkmap.bytedata), uintptr(n*((stkmap.nbit+31)/32*4))))}
}
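// Illustrative sketch (not part of the original change): the byte offset
// computed by stackmapdata above. Each bitmap holds nbit bits but starts on a
// 32-bit boundary, so a single bitmap occupies (nbit+31)/32 words of 4 bytes;
// with nbit == 40 that is 8 bytes, and bitmap n starts at byte n*8. The helper
// name is hypothetical.
func stackmapOffsetExample(n, nbit int32) int32 {
	return n * ((nbit + 31) / 32 * 4)
}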