cmd/compile: Use Sreedhar+Gao phi building algorithm

Should be more asymptotically happy. We process each variable in turn to find all the locations where it needs a phi (the dominance frontier of all of its definitions). Then we add all those phis. This takes O(n * #variables), although hopefully much less. Then we do a single tree walk to match all the FwdRefs with the nearest definition or phi. This takes O(n) time. The one remaining inefficiency is that we might end up introducing a bunch of dead phis in the first step. A TODO is to introduce phis only where they might be used by a read. The old algorithm is still faster on small functions, so there's a cutover size (currently 500 blocks). This algorithm supersedes David's sparse phi placement algorithm for large functions. Lowers compile time of the example from #14934 from ~10 sec to ~4 sec. Lowers compile time of the example from #16361 from ~4.5 sec to ~3 sec. Lowers #16407 from ~20 min to ~30 sec. Update #14934 Update #16361 Fixes #16407 Change-Id: I1cff6364e1623c143190b6a924d7599e309db58f Reviewed-on: https://go-review.googlesource.com/30163 Reviewed-by: David Chase <drchase@google.com>

cmd/compile: Use Sreedhar+Gao phi building algorithm
Should be more asymptotically happy. We process each variable in turn to find all the locations where it needs a phi (the dominance frontier of all of its definitions). Then we add all those phis. This takes O(n * #variables), although hopefully much less. Then we do a single tree walk to match all the FwdRefs with the nearest definition or phi. This takes O(n) time. The one remaining inefficiency is that we might end up introducing a bunch of dead phis in the first step. A TODO is to introduce phis only where they might be used by a read. The old algorithm is still faster on small functions, so there's a cutover size (currently 500 blocks). This algorithm supersedes David's sparse phi placement algorithm for large functions. Lowers compile time of the example from #14934 from ~10 sec to ~4 sec. Lowers compile time of the example from #16361 from ~4.5 sec to ~3 sec. Lowers #16407 from ~20 min to ~30 sec. Update #14934 Update #16361 Fixes #16407 Change-Id: I1cff6364e1623c143190b6a924d7599e309db58f Reviewed-on: https://go-review.googlesource.com/30163 Reviewed-by: David Chase <drchase@google.com>
5a6e511c · Keith Randall · d0e92f61 · 5a6e511c · 5a6e511c · d0e92f61
Commit 5a6e511c authored Sep 30, 2016 by Keith Randall
6 changed files
--- a/src/cmd/compile/internal/gc/phi.go
+++ b/src/cmd/compile/internal/gc/phi.go
--- a/src/cmd/compile/internal/gc/racewalk.go
+++ b/src/cmd/compile/internal/gc/racewalk.go
@@ -72,6 +72,7 @@ func instrument(fn *Node) {
 		fn.Func.Enter.Prepend(nd)
 		nd = mkcall("racefuncexit", nil, nil)
 		fn.Func.Exit.Append(nd)
+		fn.Func.Dcl = append(fn.Func.Dcl, &nodpc)
 	}

 	if Debug['W'] != 0 {

--- a/src/cmd/compile/internal/gc/sparselocatephifunctions.go
+++ b/src/cmd/compile/internal/gc/sparselocatephifunctions.go
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package gc
-
-import (
-	"cmd/compile/internal/ssa"
-	"fmt"
-	"math"
-)
-
-// sparseDefState contains a Go map from ONAMEs (*Node) to sparse definition trees, and
-// a search helper for the CFG's dominator tree in which those definitions are embedded.
-// Once initialized, given a use of an ONAME within a block, the ssa definition for
-// that ONAME can be discovered in time roughly proportional to the log of the number
-// of SSA definitions of that ONAME (thus avoiding pathological quadratic behavior for
-// very large programs).  The helper contains state (a dominator tree numbering) common
-// to all the sparse definition trees, as well as some necessary data obtained from
-// the ssa package.
-//
-// This algorithm has improved asymptotic complexity, but the constant factor is
-// rather large and thus it is only preferred for very large inputs containing
-// 1000s of blocks and variables.
-type sparseDefState struct {
-	helper         *ssa.SparseTreeHelper // contains one copy of information needed to do sparse mapping
-	defmapForOname map[*Node]*onameDefs  // for each ONAME, its definition set (normal and phi)
-}
-
-// onameDefs contains a record of definitions (ordinary and implied phi function) for a single OName.
-// stm is the set of definitions for the OName.
-// firstdef and lastuse are postorder block numberings that
-// conservatively bracket the entire lifetime of the OName.
-type onameDefs struct {
-	stm *ssa.SparseTreeMap
-	// firstdef and lastuse define an interval in the postorder numbering
-	// that is guaranteed to include the entire lifetime of an ONAME.
-	// In the postorder numbering, math.MaxInt32 is before anything,
-	// and 0 is after-or-equal all exit nodes and infinite loops.
-	firstdef int32 // the first definition of this ONAME *in the postorder numbering*
-	lastuse  int32 // the last use of this ONAME *in the postorder numbering*
-}
-
-// defsFor finds or creates-and-inserts-in-map the definition information
-// (sparse tree and live range) for a given OName.
-func (m *sparseDefState) defsFor(n *Node) *onameDefs {
-	d := m.defmapForOname[n]
-	if d != nil {
-		return d
-	}
-	// Reminder: firstdef/lastuse are postorder indices, not block indices,
-	// so these default values define an empty interval, not the entire one.
-	d = &onameDefs{stm: m.helper.NewTree(), firstdef: 0, lastuse: math.MaxInt32}
-	m.defmapForOname[n] = d
-	return d
-}
-
-// Insert adds a definition at b (with specified before/within/after adjustment)
-// to sparse tree onameDefs.  The lifetime is extended as necessary.
-func (m *sparseDefState) Insert(tree *onameDefs, b *ssa.Block, adjust int32) {
-	bponum := m.helper.Ponums[b.ID]
-	if bponum > tree.firstdef {
-		tree.firstdef = bponum
-	}
-	tree.stm.Insert(b, adjust, b, m.helper)
-}
-
-// Use updates tree to record a use within b, extending the lifetime as necessary.
-func (m *sparseDefState) Use(tree *onameDefs, b *ssa.Block) {
-	bponum := m.helper.Ponums[b.ID]
-	if bponum < tree.lastuse {
-		tree.lastuse = bponum
-	}
-}
-
-// locatePotentialPhiFunctions finds all the places where phi functions
-// will be inserted into a program and records those and ordinary definitions
-// in a "map" (not a Go map) that given an OName and use site, returns the
-// SSA definition for that OName that will reach the use site (that is,
-// the use site's nearest def/phi site in the dominator tree.)
-func (s *state) locatePotentialPhiFunctions(fn *Node) *sparseDefState {
-	// s.config.SparsePhiCutoff() is compared with product of numblocks and numvalues,
-	// if product is smaller than cutoff, use old non-sparse method.
-	// cutoff == 0 implies all sparse
-	// cutoff == uint(-1) implies all non-sparse
-	if uint64(s.f.NumValues())*uint64(s.f.NumBlocks()) < s.config.SparsePhiCutoff() {
-		return nil
-	}
-
-	helper := ssa.NewSparseTreeHelper(s.f)
-	po := helper.Po // index by block.ID to obtain postorder # of block.
-	trees := make(map[*Node]*onameDefs)
-	dm := &sparseDefState{defmapForOname: trees, helper: helper}
-
-	// Process params, taking note of their special lifetimes
-	b := s.f.Entry
-	for _, n := range fn.Func.Dcl {
-		switch n.Class {
-		case PPARAM, PPARAMOUT:
-			t := dm.defsFor(n)
-			dm.Insert(t, b, ssa.AdjustBefore) // define param at entry block
-			if n.Class == PPARAMOUT {
-				dm.Use(t, po[0]) // Explicitly use PPARAMOUT at very last block
-			}
-		default:
-		}
-	}
-
-	// Process memory variable.
-	t := dm.defsFor(&memVar)
-	dm.Insert(t, b, ssa.AdjustBefore) // define memory at entry block
-	dm.Use(t, po[0])                  // Explicitly use memory at last block
-
-	// Next load the map w/ basic definitions for ONames recorded per-block
-	// Iterate over po to avoid unreachable blocks.
-	for i := len(po) - 1; i >= 0; i-- {
-		b := po[i]
-		m := s.defvars[b.ID]
-		for n := range m { // no specified order, but per-node trees are independent.
-			t := dm.defsFor(n)
-			dm.Insert(t, b, ssa.AdjustWithin)
-		}
-	}
-
-	// Find last use of each variable
-	for _, v := range s.fwdRefs {
-		b := v.Block
-		name := v.Aux.(*Node)
-		t := dm.defsFor(name)
-		dm.Use(t, b)
-	}
-
-	for _, t := range trees {
-		// iterating over names in the outer loop
-		for change := true; change; {
-			change = false
-			for i := t.firstdef; i >= t.lastuse; i-- {
-				// Iterating in reverse of post-order reduces number of 'change' iterations;
-				// all possible forward flow goes through each time.
-				b := po[i]
-				// Within tree t, would a use at b require a phi function to ensure a single definition?
-				// TODO: perhaps more efficient to record specific use sites instead of range?
-				if len(b.Preds) < 2 {
-					continue // no phi possible
-				}
-				phi := t.stm.Find(b, ssa.AdjustWithin, helper) // Look for defs in earlier block or AdjustBefore in this one.
-				if phi != nil && phi.(*ssa.Block) == b {
-					continue // has a phi already in this block.
-				}
-				var defseen interface{}
-				// Do preds see different definitions? if so, need a phi function.
-				for _, e := range b.Preds {
-					p := e.Block()
-					dm.Use(t, p)                                // always count phi pred as "use"; no-op except for loop edges, which matter.
-					x := t.stm.Find(p, ssa.AdjustAfter, helper) // Look for defs reaching or within predecessors.
-					if x == nil {                               // nil def from a predecessor means a backedge that will be visited soon.
-						continue
-					}
-					if defseen == nil {
-						defseen = x
-					}
-					if defseen != x {
-						// Need to insert a phi function here because predecessors's definitions differ.
-						change = true
-						// Phi insertion is at AdjustBefore, visible with find in same block at AdjustWithin or AdjustAfter.
-						dm.Insert(t, b, ssa.AdjustBefore)
-						break
-					}
-				}
-			}
-		}
-	}
-	return dm
-}
-
-// FindBetterDefiningBlock tries to find a better block for a definition of OName name
-// reaching (or within) p than p itself.  If it cannot, it returns p instead.
-// This aids in more efficient location of phi functions, since it can skip over
-// branch code that might contain a definition of name if it actually does not.
-func (m *sparseDefState) FindBetterDefiningBlock(name *Node, p *ssa.Block) *ssa.Block {
-	if m == nil {
-		return p
-	}
-	t := m.defmapForOname[name]
-	// For now this is fail-soft, since the old algorithm still works using the unimproved block.
-	if t == nil {
-		return p
-	}
-	x := t.stm.Find(p, ssa.AdjustAfter, m.helper)
-	if x == nil {
-		return p
-	}
-	b := x.(*ssa.Block)
-	if b == nil {
-		return p
-	}
-	return b
-}
-
-func (d *onameDefs) String() string {
-	return fmt.Sprintf("onameDefs:first=%d,last=%d,tree=%s", d.firstdef, d.lastuse, d.stm.String())
-}
--- a/src/cmd/compile/internal/gc/ssa.go
+++ b/src/cmd/compile/internal/gc/ssa.go
@@ -80,6 +80,7 @@ func buildssa(fn *Node) *ssa.Func {
 	// Allocate starting values
 	s.labels = map[string]*ssaLabel{}
 	s.labeledNodes = map[*Node]*ssaLabel{}
+	s.fwdVars = map[*Node]*ssa.Value{}
 	s.startmem = s.entryNewValue0(ssa.OpInitMem, ssa.TypeMem)
 	s.sp = s.entryNewValue0(ssa.OpSP, Types[TUINTPTR]) // TODO: use generic pointer type (unsafe.Pointer?) instead
 	s.sb = s.entryNewValue0(ssa.OpSB, Types[TUINTPTR])
@@ -114,6 +115,21 @@ func buildssa(fn *Node) *ssa.Func {
 		}
 	}

+	// Populate arguments.
+	for _, n := range fn.Func.Dcl {
+		if n.Class != PPARAM {
+			continue
+		}
+		var v *ssa.Value
+		if s.canSSA(n) {
+			v = s.newValue0A(ssa.OpArg, n.Type, n)
+		} else {
+			// Not SSAable. Load it.
+			v = s.newValue2(ssa.OpLoad, n.Type, s.decladdrs[n], s.startmem)
+		}
+		s.vars[n] = v
+	}
+
 	// Convert the AST-based IR to the SSA-based IR
 	s.stmts(fn.Func.Enter)
 	s.stmts(fn.Nbody)
@@ -151,16 +167,7 @@ func buildssa(fn *Node) *ssa.Func {
 		return nil
 	}

-	prelinkNumvars := s.f.NumValues()
-	sparseDefState := s.locatePotentialPhiFunctions(fn)
-
-	// Link up variable uses to variable definitions
-	s.linkForwardReferences(sparseDefState)
-
-	if ssa.BuildStats > 0 {
-		s.f.LogStat("build", s.f.NumBlocks(), "blocks", prelinkNumvars, "vars_before",
-			s.f.NumValues(), "vars_after", prelinkNumvars*s.f.NumBlocks(), "ssa_phi_loc_cutoff_score")
-	}
+	s.insertPhis()

 	// Don't carry reference this around longer than necessary
 	s.exitCode = Nodes{}
@@ -197,8 +204,14 @@ type state struct {

 	// variable assignments in the current block (map from variable symbol to ssa value)
 	// *Node is the unique identifier (an ONAME Node) for the variable.
+	// TODO: keep a single varnum map, then make all of these maps slices instead?
 	vars map[*Node]*ssa.Value

+	// fwdVars are variables that are used before they are defined in the current block.
+	// This map exists just to coalesce multiple references into a single FwdRef op.
+	// *Node is the unique identifier (an ONAME Node) for the variable.
+	fwdVars map[*Node]*ssa.Value
+
 	// all defined variables at the end of each block. Indexed by block ID.
 	defvars []map[*Node]*ssa.Value

@@ -220,12 +233,12 @@ type state struct {
 	// Used to deduplicate panic calls.
 	panics map[funcLine]*ssa.Block

-	// list of FwdRef values.
-	fwdRefs []*ssa.Value
-
 	// list of PPARAMOUT (return) variables.
 	returns []*Node

+	// A dummy value used during phi construction.
+	placeholder *ssa.Value
+
 	cgoUnsafeArgs bool
 	noWB          bool
 	WBLineno      int32 // line number of first write barrier. 0=no write barriers
@@ -292,6 +305,9 @@ func (s *state) startBlock(b *ssa.Block) {
 	}
 	s.curBlock = b
 	s.vars = map[*Node]*ssa.Value{}
+	for n := range s.fwdVars {
+		delete(s.fwdVars, n)
+	}
 }

 // endBlock marks the end of generating code for the current block.
@@ -2951,9 +2967,8 @@ func (s *state) addr(n *Node, bounded bool) (*ssa.Value, bool) {
 			if v != nil {
 				return v, false
 			}
-			if n.String() == ".fp" {
-				// Special arg that points to the frame pointer.
-				// (Used by the race detector, others?)
+			if n == nodfp {
+				// Special arg that points to the frame pointer (Used by ORECOVER).
 				aux := s.lookupSymbol(n, &ssa.ArgSymbol{Typ: n.Type, Node: n})
 				return s.entryNewValue1A(ssa.OpAddr, t, aux, s.sp), false
 			}
@@ -3971,132 +3986,30 @@ func (s *state) checkgoto(from *Node, to *Node) {
 // variable returns the value of a variable at the current location.
 func (s *state) variable(name *Node, t ssa.Type) *ssa.Value {
 	v := s.vars[name]
-	if v == nil {
-		v = s.newValue0A(ssa.OpFwdRef, t, name)
-		s.fwdRefs = append(s.fwdRefs, v)
-		s.vars[name] = v
-		s.addNamedValue(name, v)
-	}
-	return v
-}
-
-func (s *state) mem() *ssa.Value {
-	return s.variable(&memVar, ssa.TypeMem)
-}
-
-func (s *state) linkForwardReferences(dm *sparseDefState) {
-
-	// Build SSA graph. Each variable on its first use in a basic block
-	// leaves a FwdRef in that block representing the incoming value
-	// of that variable. This function links that ref up with possible definitions,
-	// inserting Phi values as needed. This is essentially the algorithm
-	// described by Braun, Buchwald, Hack, Leißa, Mallon, and Zwinkau:
-	// http://pp.info.uni-karlsruhe.de/uploads/publikationen/braun13cc.pdf
-	// Differences:
-	//   - We use FwdRef nodes to postpone phi building until the CFG is
-	//     completely built. That way we can avoid the notion of "sealed"
-	//     blocks.
-	//   - Phi optimization is a separate pass (in ../ssa/phielim.go).
-	for len(s.fwdRefs) > 0 {
-		v := s.fwdRefs[len(s.fwdRefs)-1]
-		s.fwdRefs = s.fwdRefs[:len(s.fwdRefs)-1]
-		s.resolveFwdRef(v, dm)
-	}
-}
-
-// resolveFwdRef modifies v to be the variable's value at the start of its block.
-// v must be a FwdRef op.
-func (s *state) resolveFwdRef(v *ssa.Value, dm *sparseDefState) {
-	b := v.Block
-	name := v.Aux.(*Node)
-	v.Aux = nil
-	if b == s.f.Entry {
-		// Live variable at start of function.
-		if s.canSSA(name) {
-			if strings.HasPrefix(name.Sym.Name, "autotmp_") {
-				// It's likely that this is an uninitialized variable in the entry block.
-				s.Fatalf("Treating auto as if it were arg, func %s, node %v, value %v", b.Func.Name, name, v)
-			}
-			v.Op = ssa.OpArg
-			v.Aux = name
-			return
-		}
-		// Not SSAable. Load it.
-		addr := s.decladdrs[name]
-		if addr == nil {
-			// TODO: closure args reach here.
-			s.Fatalf("unhandled closure arg %v at entry to function %s", name, b.Func.Name)
-		}
-		if _, ok := addr.Aux.(*ssa.ArgSymbol); !ok {
-			s.Fatalf("variable live at start of function %s is not an argument %v", b.Func.Name, name)
-		}
-		v.Op = ssa.OpLoad
-		v.AddArgs(addr, s.startmem)
-		return
-	}
-	if len(b.Preds) == 0 {
-		// This block is dead; we have no predecessors and we're not the entry block.
-		// It doesn't matter what we use here as long as it is well-formed.
-		v.Op = ssa.OpUnknown
-		return
-	}
-	// Find variable value on each predecessor.
-	var argstore [4]*ssa.Value
-	args := argstore[:0]
-	for _, e := range b.Preds {
-		p := e.Block()
-		p = dm.FindBetterDefiningBlock(name, p) // try sparse improvement on p
-		args = append(args, s.lookupVarOutgoing(p, v.Type, name, v.Line))
-	}
-
-	// Decide if we need a phi or not. We need a phi if there
-	// are two different args (which are both not v).
-	var w *ssa.Value
-	for _, a := range args {
-		if a == v {
-			continue // self-reference
-		}
-		if a == w {
-			continue // already have this witness
-		}
-		if w != nil {
-			// two witnesses, need a phi value
-			v.Op = ssa.OpPhi
-			v.AddArgs(args...)
-			return
-		}
-		w = a // save witness
+	if v != nil {
+		return v
 	}
-	if w == nil {
-		s.Fatalf("no witness for reachable phi %s", v)
+	v = s.fwdVars[name]
+	if v != nil {
+		return v
 	}
-	// One witness. Make v a copy of w.
-	v.Op = ssa.OpCopy
-	v.AddArg(w)
-}

-// lookupVarOutgoing finds the variable's value at the end of block b.
-func (s *state) lookupVarOutgoing(b *ssa.Block, t ssa.Type, name *Node, line int32) *ssa.Value {
-	for {
-		if v, ok := s.defvars[b.ID][name]; ok {
-			return v
-		}
-		// The variable is not defined by b and we haven't looked it up yet.
-		// If b has exactly one predecessor, loop to look it up there.
-		// Otherwise, give up and insert a new FwdRef and resolve it later.
-		if len(b.Preds) != 1 {
-			break
-		}
-		b = b.Preds[0].Block()
+	if s.curBlock == s.f.Entry {
+		// No variable should be live at entry.
+		s.Fatalf("Value live at entry. It shouldn't be. func %s, node %v, value %v", s.f.Name, name, v)
 	}
-	// Generate a FwdRef for the variable and return that.
-	v := b.NewValue0A(line, ssa.OpFwdRef, t, name)
-	s.fwdRefs = append(s.fwdRefs, v)
-	s.defvars[b.ID][name] = v
+	// Make a FwdRef, which records a value that's live on block input.
+	// We'll find the matching definition as part of insertPhis.
+	v = s.newValue0A(ssa.OpFwdRef, t, name)
+	s.fwdVars[name] = v
 	s.addNamedValue(name, v)
 	return v
 }

+func (s *state) mem() *ssa.Value {
+	return s.variable(&memVar, ssa.TypeMem)
+}
+
 func (s *state) addNamedValue(n *Node, v *ssa.Value) {
 	if n.Class == Pxxx {
 		// Don't track our dummy nodes (&memVar etc.).

--- a/src/cmd/compile/internal/ssa/block.go
+++ b/src/cmd/compile/internal/ssa/block.go
@@ -89,6 +89,9 @@ type Edge struct {
 func (e Edge) Block() *Block {
 	return e.b
 }
+func (e Edge) Index() int {
+	return e.i
+}

 //     kind           control    successors
 //   ------------------------------------------

--- a/src/cmd/compile/internal/ssa/func.go
+++ b/src/cmd/compile/internal/ssa/func.go
@@ -459,6 +459,9 @@ func (f *Func) idom() []*Block {
 	}
 	return f.cachedIdom
 }
+func (f *Func) Idom() []*Block {
+	return f.idom()
+}

 // sdom returns a sparse tree representing the dominator relationships
 // among the blocks of f.