cmd/gc: move flow graph into portable opt

Now there's only one copy of the flow graph construction and dominator computation, and different optimizations can attach different annotations to the instructions. R=ken2 CC=golang-dev https://golang.org/cl/12797045

cmd/gc: move flow graph into portable opt
Now there's only one copy of the flow graph construction and dominator computation, and different optimizations can attach different annotations to the instructions. R=ken2 CC=golang-dev https://golang.org/cl/12797045
dbf96add · Russ Cox · 954d1474 · dbf96add · dbf96add · dbf96add
Commit dbf96add authored Aug 12, 2013 by Russ Cox
11 changed files
--- a/src/cmd/5g/opt.h
+++ b/src/cmd/5g/opt.h
@@ -55,6 +55,7 @@ typedef	struct	Rgn	Rgn;
 // r->prog->opt points back to r.
 struct	Reg
 {
+	Flow	f;

 	Bits	set;  		// variables written by this instruction.
 	Bits	use1; 		// variables read by prog->from.
@@ -68,19 +69,6 @@ struct	Reg
 	Bits	act;

 	int32	regu;		// register used bitmap
-	int32	rpo;		// reverse post ordering
-	int32	active;
-
-	uint16	loop;		// x5 for every loop
-	uchar	refset;		// diagnostic generated
-
-	Reg*	p1;     	// predecessors of this instruction: p1,
-	Reg*	p2;     	// and then p2 linked though p2link.
-	Reg*	p2link;
-	Reg*	s1;     	// successors of this instruction (at most two: s1 and s2).
-	Reg*	s2;
-	Reg*	link;   	// next instruction in function code
-	Prog*	prog;   	// actual instruction
 };
 #define	R	((Reg*)0)

@@ -96,7 +84,6 @@ struct	Rgn
 EXTERN	int32	exregoffset;		// not set
 EXTERN	int32	exfregoffset;		// not set
 EXTERN	Reg*	firstr;
-EXTERN	Reg*	lastr;
 EXTERN	Reg	zreg;
 EXTERN	Reg*	freer;
 EXTERN	Reg**	rpo2r;
@@ -134,34 +121,21 @@ void	regopt(Prog*);
 void	addmove(Reg*, int, int, int);
 Bits	mkvar(Reg *r, Adr *a);
 void	prop(Reg*, Bits, Bits);
-void	loopit(Reg*, int32);
 void	synch(Reg*, Bits);
 uint32	allreg(uint32, Rgn*);
 void	paint1(Reg*, int);
 uint32	paint2(Reg*, int);
 void	paint3(Reg*, int, int32, int);
 void	addreg(Adr*, int);
-void	dumpit(char *str, Reg *r0);
+void	dumpit(char *str, Flow *r0, int);

 /*
 * peep.c
 */
-void	peep(void);
-void	excise(Reg*);
-Reg*	uniqp(Reg*);
-Reg*	uniqs(Reg*);
-int	regtyp(Adr*);
-int	anyvar(Adr*);
-int	subprop(Reg*);
-int	copyprop(Reg*);
-int	copy1(Adr*, Adr*, Reg*, int);
+void	peep(Prog*);
+void	excise(Flow*);
 int	copyu(Prog*, Adr*, Adr*);

-int	copyas(Adr*, Adr*);
-int	copyau(Adr*, Adr*);
-int	copysub(Adr*, Adr*, Adr*, int);
-int	copysub1(Prog*, Adr*, Adr*, int);
-
 int32	RtoB(int);
 int32	FtoB(int);
 int	BtoR(int32);

--- a/src/cmd/5g/peep.c
+++ b/src/cmd/5g/peep.c
--- a/src/cmd/5g/reg.c
+++ b/src/cmd/5g/reg.c
--- a/src/cmd/6g/opt.h
+++ b/src/cmd/6g/opt.h
@@ -55,6 +55,7 @@ typedef	struct	Rgn	Rgn;
 // r->prog->opt points back to r.
 struct	Reg
 {
+	Flow	f;

 	Bits	set;  		// variables written by this instruction.
 	Bits	use1; 		// variables read by prog->from.
@@ -68,19 +69,6 @@ struct	Reg
 	Bits	act;

 	int32	regu;		// register used bitmap
-	int32	rpo;		// reverse post ordering
-	int32	active;
-
-	uint16	loop;		// x5 for every loop
-	uchar	refset;		// diagnostic generated
-
-	Reg*	p1;     	// predecessors of this instruction: p1,
-	Reg*	p2;     	// and then p2 linked though p2link.
-	Reg*	p2link;
-	Reg*	s1;     	// successors of this instruction (at most two: s1 and s2).
-	Reg*	s2;
-	Reg*	link;   	// next instruction in function code
-	Prog*	prog;   	// actual instruction
 };
 #define	R	((Reg*)0)

@@ -96,10 +84,7 @@ struct	Rgn
 EXTERN	int32	exregoffset;		// not set
 EXTERN	int32	exfregoffset;		// not set
 EXTERN	Reg*	firstr;
-EXTERN	Reg*	lastr;
 EXTERN	Reg	zreg;
-EXTERN	Reg*	freer;
-EXTERN	Reg**	rpo2r;
 EXTERN	Rgn	region[NRGN];
 EXTERN	Rgn*	rgp;
 EXTERN	int	nregion;
@@ -113,7 +98,6 @@ EXTERN	Bits	addrs;
 EXTERN	Bits	ovar;
 EXTERN	int	change;
 EXTERN	int32	maxnr;
-EXTERN	int32*	idom;

 EXTERN	struct
 {
@@ -128,41 +112,27 @@ EXTERN	struct
 /*
 * reg.c
 */
-Reg*	rega(void);
 int	rcmp(const void*, const void*);
 void	regopt(Prog*);
 void	addmove(Reg*, int, int, int);
 Bits	mkvar(Reg*, Adr*);
 void	prop(Reg*, Bits, Bits);
-void	loopit(Reg*, int32);
 void	synch(Reg*, Bits);
 uint32	allreg(uint32, Rgn*);
 void	paint1(Reg*, int);
 uint32	paint2(Reg*, int);
 void	paint3(Reg*, int, int32, int);
 void	addreg(Adr*, int);
-void	dumpone(Reg*);
-void	dumpit(char*, Reg*);
+void	dumpone(Flow*, int);
+void	dumpit(char*, Flow*, int);

 /*
 * peep.c
 */
-void	peep(void);
-void	excise(Reg*);
-Reg*	uniqp(Reg*);
-Reg*	uniqs(Reg*);
-int	regtyp(Adr*);
-int	anyvar(Adr*);
-int	subprop(Reg*);
-int	copyprop(Reg*);
-int	copy1(Adr*, Adr*, Reg*, int);
+void	peep(Prog*);
+void	excise(Flow*);
 int	copyu(Prog*, Adr*, Adr*);

-int	copyas(Adr*, Adr*);
-int	copyau(Adr*, Adr*);
-int	copysub(Adr*, Adr*, Adr*, int);
-int	copysub1(Prog*, Adr*, Adr*, int);
-
 int32	RtoB(int);
 int32	FtoB(int);
 int	BtoR(int32);

--- a/src/cmd/6g/peep.c
+++ b/src/cmd/6g/peep.c
@@ -33,11 +33,18 @@
 #include "gg.h"
 #include "opt.h"

-static void	conprop(Reg *r);
-static void elimshortmov(Reg *r);
-static int prevl(Reg *r, int reg);
-static void pushback(Reg *r);
-static int regconsttyp(Adr*);
+static void	conprop(Flow *r);
+static void	elimshortmov(Graph *g);
+static int	prevl(Flow *r, int reg);
+static void	pushback(Flow *r);
+static int	regconsttyp(Adr*);
+static int	regtyp(Adr*);
+static int	subprop(Flow*);
+static int	copyprop(Graph*, Flow*);
+static int	copy1(Adr*, Adr*, Flow*, int);
+static int	copyas(Adr*, Adr*);
+static int	copyau(Adr*, Adr*);
+static int	copysub(Adr*, Adr*, Adr*, int);

 // do we need the carry bit
 static int
@@ -56,19 +63,19 @@ needc(Prog *p)
 	return 0;
 }

-static Reg*
-rnops(Reg *r)
+static Flow*
+rnops(Flow *r)
 {
 	Prog *p;
-	Reg *r1;
+	Flow *r1;

-	if(r != R)
+	if(r != nil)
 	for(;;) {
 		p = r->prog;
 		if(p->as != ANOP || p->from.type != D_NONE || p->to.type != D_NONE)
 			break;
 		r1 = uniqs(r);
-		if(r1 == R)
+		if(r1 == nil)
 			break;
 		r = r1;
 	}
@@ -76,52 +83,25 @@ rnops(Reg *r)
 }

 void
-peep(void)
+peep(Prog *firstp)
 {
-	Reg *r, *r1, *r2;
+	Flow *r, *r1;
+	Graph *g;
 	Prog *p, *p1;
 	int t;
-	ProgInfo info;
-
-	/*
-	 * complete R structure
-	 */
-	t = 0;
-	for(r=firstr; r!=R; r=r1) {
-		r1 = r->link;
-		if(r1 == R)
-			break;
-		p = r->prog->link;
-		for(p = r->prog->link; p != r1->prog; p = p->link) {
-			proginfo(&info, p);
-			if(info.flags & Skip)
-				continue;
-
-			r2 = rega();
-			r->link = r2;
-			r2->link = r1;

-			r2->prog = p;
-			p->opt = r2;
-
-			r2->p1 = r;
-			r->s1 = r2;
-			r2->s1 = r1;
-			r1->p1 = r2;
+	g = flowstart(firstp, sizeof(Flow));
+	if(g == nil)
+		return;

-			r = r2;
-			t++;
-		}
-	}
-	
 	// byte, word arithmetic elimination.
-	elimshortmov(r);
+	elimshortmov(g);

 	// constant propagation
-	// find MOV $con,R followed by
-	// another MOV $con,R without
-	// setting R in the interim
-	for(r=firstr; r!=R; r=r->link) {
+	// find MOV $con,R followed by
+	// another MOV $con,R without
+	// setting R in the interim
+	for(r=g->start; r!=nil; r=r->link) {
 		p = r->prog;
 		switch(p->as) {
 		case ALEAL:
@@ -147,10 +127,10 @@ peep(void)

 loop1:
 	if(debug['P'] && debug['v'])
-		dumpit("loop1", firstr);
+		dumpit("loop1", g->start, 0);

 	t = 0;
-	for(r=firstr; r!=R; r=r->link) {
+	for(r=g->start; r!=nil; r=r->link) {
 		p = r->prog;
 		switch(p->as) {
 		case AMOVL:
@@ -159,11 +139,11 @@ loop1:
 		case AMOVSD:
 			if(regtyp(&p->to))
 			if(regtyp(&p->from)) {
-				if(copyprop(r)) {
+				if(copyprop(g, r)) {
 					excise(r);
 					t++;
 				} else
-				if(subprop(r) && copyprop(r)) {
+				if(subprop(r) && copyprop(g, r)) {
 					excise(r);
 					t++;
 				}
@@ -176,7 +156,7 @@ loop1:
 		case AMOVWLSX:
 			if(regtyp(&p->to)) {
 				r1 = rnops(uniqs(r));
-				if(r1 != R) {
+				if(r1 != nil) {
 					p1 = r1->prog;
 					if(p->as == p1->as && p->to.type == p1->from.type){
 						p1->as = AMOVL;
@@ -195,7 +175,7 @@ loop1:
 		case AMOVQL:
 			if(regtyp(&p->to)) {
 				r1 = rnops(uniqs(r));
-				if(r1 != R) {
+				if(r1 != nil) {
 					p1 = r1->prog;
 					if(p->as == p1->as && p->to.type == p1->from.type){
 						p1->as = AMOVQ;
@@ -278,7 +258,7 @@ loop1:
 	// can be replaced by MOVAPD, which moves the pair of float64s
 	// instead of just the lower one.  We only use the lower one, but
 	// the processor can do better if we do moves using both.
-	for(r=firstr; r!=R; r=r->link) {
+	for(r=g->start; r!=nil; r=r->link) {
 		p = r->prog;
 		if(p->as == AMOVLQZX)
 		if(regtyp(&p->from))
@@ -295,7 +275,7 @@ loop1:
 	// load pipelining
 	// push any load from memory as early as possible
 	// to give it time to complete before use.
-	for(r=firstr; r!=R; r=r->link) {
+	for(r=g->start; r!=nil; r=r->link) {
 		p = r->prog;
 		switch(p->as) {
 		case AMOVB:
@@ -307,17 +287,19 @@ loop1:
 				pushback(r);
 		}
 	}
+	
+	flowend(g);
 }

 static void
-pushback(Reg *r0)
+pushback(Flow *r0)
 {
-	Reg *r, *b;
+	Flow *r, *b;
 	Prog *p0, *p, t;
 	
-	b = R;
+	b = nil;
 	p0 = r0->prog;
-	for(r=uniqp(r0); r!=R && uniqs(r)!=R; r=uniqp(r)) {
+	for(r=uniqp(r0); r!=nil && uniqs(r)!=nil; r=uniqp(r)) {
 		p = r->prog;
 		if(p->as != ANOP) {
 			if(!regconsttyp(&p->from) || !regtyp(&p->to))
@@ -330,11 +312,11 @@ pushback(Reg *r0)
 		b = r;
 	}
 	
-	if(b == R) {
+	if(b == nil) {
 		if(debug['v']) {
 			print("no pushback: %P\n", r0->prog);
 			if(r)
-				print("\t%P [%d]\n", r->prog, uniqs(r)!=R);
+				print("\t%P [%d]\n", r->prog, uniqs(r)!=nil);
 		}
 		return;
 	}
@@ -377,7 +359,7 @@ pushback(Reg *r0)
 }

 void
-excise(Reg *r)
+excise(Flow *r)
 {
 	Prog *p;

@@ -392,39 +374,7 @@ excise(Reg *r)
 	ostats.ndelmov++;
 }

-Reg*
-uniqp(Reg *r)
-{
-	Reg *r1;
-
-	r1 = r->p1;
-	if(r1 == R) {
-		r1 = r->p2;
-		if(r1 == R || r1->p2link != R)
-			return R;
-	} else
-		if(r->p2 != R)
-			return R;
-	return r1;
-}
-
-Reg*
-uniqs(Reg *r)
-{
-	Reg *r1;
-
-	r1 = r->s1;
-	if(r1 == R) {
-		r1 = r->s2;
-		if(r1 == R)
-			return R;
-	} else
-		if(r->s2 != R)
-			return R;
-	return r1;
-}
-
-int
+static int
 regtyp(Adr *a)
 {
 	int t;
@@ -448,12 +398,12 @@ regtyp(Adr *a)
 // TODO: Using the Q forms here instead of the L forms
 // seems unnecessary, and it makes the instructions longer.
 static void
-elimshortmov(Reg *r)
+elimshortmov(Graph *g)
 {
 	Prog *p;
+	Flow *r;

-	USED(r);
-	for(r=firstr; r!=R; r=r->link) {
+	for(r=g->start; r!=nil; r=r->link) {
 		p = r->prog;
 		if(regtyp(&p->to)) {
 			switch(p->as) {
@@ -554,13 +504,13 @@ regconsttyp(Adr *a)

 // is reg guaranteed to be truncated by a previous L instruction?
 static int
-prevl(Reg *r0, int reg)
+prevl(Flow *r0, int reg)
 {
 	Prog *p;
-	Reg *r;
+	Flow *r;
 	ProgInfo info;

-	for(r=uniqp(r0); r!=R; r=uniqp(r)) {
+	for(r=uniqp(r0); r!=nil; r=uniqp(r)) {
 		p = r->prog;
 		if(p->to.type == reg) {
 			proginfo(&info, p);
@@ -588,13 +538,13 @@ prevl(Reg *r0, int reg)
 * hopefully, then the former or latter MOV
 * will be eliminated by copy propagation.
 */
-int
-subprop(Reg *r0)
+static int
+subprop(Flow *r0)
 {
 	Prog *p;
 	ProgInfo info;
 	Adr *v1, *v2;
-	Reg *r;
+	Flow *r;
 	int t;

 	if(debug['P'] && debug['v'])
@@ -612,10 +562,10 @@ subprop(Reg *r0)
 			print("\tnot regtype %D; return 0\n", v2);
 		return 0;
 	}
-	for(r=uniqp(r0); r!=R; r=uniqp(r)) {
+	for(r=uniqp(r0); r!=nil; r=uniqp(r)) {
 		if(debug['P'] && debug['v'])
 			print("\t? %P\n", r->prog);
-		if(uniqs(r) == R) {
+		if(uniqs(r) == nil) {
 			if(debug['P'] && debug['v'])
 				print("\tno unique successor\n");
 			break;
@@ -689,12 +639,12 @@ gotit:
 *	set v1	F=1
 *	set v2	return success
 */
-int
-copyprop(Reg *r0)
+static int
+copyprop(Graph *g, Flow *r0)
 {
 	Prog *p;
 	Adr *v1, *v2;
-	Reg *r;
+	Flow *r;

 	if(debug['P'] && debug['v'])
 		print("copyprop %P\n", r0->prog);
@@ -703,13 +653,13 @@ copyprop(Reg *r0)
 	v2 = &p->to;
 	if(copyas(v1, v2))
 		return 1;
-	for(r=firstr; r!=R; r=r->link)
+	for(r=g->start; r!=nil; r=r->link)
 		r->active = 0;
 	return copy1(v1, v2, r0->s1, 0);
 }

-int
-copy1(Adr *v1, Adr *v2, Reg *r, int f)
+static int
+copy1(Adr *v1, Adr *v2, Flow *r, int f)
 {
 	int t;
 	Prog *p;
@@ -722,11 +672,11 @@ copy1(Adr *v1, Adr *v2, Reg *r, int f)
 	r->active = 1;
 	if(debug['P'])
 		print("copy %D->%D f=%d\n", v1, v2, f);
-	for(; r != R; r = r->s1) {
+	for(; r != nil; r = r->s1) {
 		p = r->prog;
 		if(debug['P'])
 			print("%P", p);
-		if(!f && uniqp(r) == R) {
+		if(!f && uniqp(r) == nil) {
 			f = 1;
 			if(debug['P'])
 				print("; merge; f=%d", f);
@@ -880,7 +830,7 @@ copyu(Prog *p, Adr *v, Adr *s)
 * could be set/use depending on
 * semantics
 */
-int
+static int
 copyas(Adr *a, Adr *v)
 {
 	if(a->type != v->type)
@@ -896,7 +846,7 @@ copyas(Adr *a, Adr *v)
 /*
 * either direct or indirect
 */
-int
+static int
 copyau(Adr *a, Adr *v)
 {

@@ -924,7 +874,7 @@ copyau(Adr *a, Adr *v)
 * substitute s for v in a
 * return failure to substitute
 */
-int
+static int
 copysub(Adr *a, Adr *v, Adr *s, int f)
 {
 	int t;
@@ -957,9 +907,9 @@ copysub(Adr *a, Adr *v, Adr *s, int f)
 }

 static void
-conprop(Reg *r0)
+conprop(Flow *r0)
 {
-	Reg *r;
+	Flow *r;
 	Prog *p, *p0;
 	int t;
 	Adr *v0;
@@ -970,9 +920,9 @@ conprop(Reg *r0)

 loop:
 	r = uniqs(r);
-	if(r == R || r == r0)
+	if(r == nil || r == r0)
 		return;
-	if(uniqp(r) == R)
+	if(uniqp(r) == nil)
 		return;

 	p = r->prog;

--- a/src/cmd/6g/reg.c
+++ b/src/cmd/6g/reg.c
--- a/src/cmd/8g/opt.h
+++ b/src/cmd/8g/opt.h
@@ -55,6 +55,7 @@ typedef	struct	Rgn	Rgn;
 // r->prog->opt points back to r.
 struct	Reg
 {
+	Flow	f;

 	Bits	set;  		// variables written by this instruction.
 	Bits	use1; 		// variables read by prog->from.
@@ -96,7 +97,6 @@ struct	Rgn
 EXTERN	int32	exregoffset;		// not set
 EXTERN	int32	exfregoffset;		// not set
 EXTERN	Reg*	firstr;
-EXTERN	Reg*	lastr;
 EXTERN	Reg	zreg;
 EXTERN	Reg*	freer;
 EXTERN	Reg**	rpo2r;
@@ -141,28 +141,16 @@ void	paint1(Reg*, int);
 uint32	paint2(Reg*, int);
 void	paint3(Reg*, int, int32, int);
 void	addreg(Adr*, int);
-void	dumpone(Reg*);
-void	dumpit(char*, Reg*);
+void	dumpone(Flow*, int);
+void	dumpit(char*, Flow*, int);

 /*
 * peep.c
 */
-void	peep(void);
-void	excise(Reg*);
-Reg*	uniqp(Reg*);
-Reg*	uniqs(Reg*);
-int	regtyp(Adr*);
-int	anyvar(Adr*);
-int	subprop(Reg*);
-int	copyprop(Reg*);
-int	copy1(Adr*, Adr*, Reg*, int);
+void	peep(Prog*);
+void	excise(Flow*);
 int	copyu(Prog*, Adr*, Adr*);

-int	copyas(Adr*, Adr*);
-int	copyau(Adr*, Adr*);
-int	copysub(Adr*, Adr*, Adr*, int);
-int	copysub1(Prog*, Adr*, Adr*, int);
-
 int32	RtoB(int);
 int32	FtoB(int);
 int	BtoR(int32);

--- a/src/cmd/8g/peep.c
+++ b/src/cmd/8g/peep.c
@@ -35,8 +35,15 @@

 #define	REGEXT	0

-static void	conprop(Reg *r);
-static void elimshortmov(Reg *r);
+static void	conprop(Flow *r);
+static void	elimshortmov(Graph*);
+static int	regtyp(Adr*);
+static int	subprop(Flow*);
+static int	copyprop(Graph*, Flow*);
+static int	copy1(Adr*, Adr*, Flow*, int);
+static int	copyas(Adr*, Adr*);
+static int	copyau(Adr*, Adr*);
+static int	copysub(Adr*, Adr*, Adr*, int);

 // do we need the carry bit
 static int
@@ -55,19 +62,19 @@ needc(Prog *p)
 	return 0;
 }

-static Reg*
-rnops(Reg *r)
+static Flow*
+rnops(Flow *r)
 {
 	Prog *p;
-	Reg *r1;
+	Flow *r1;

-	if(r != R)
+	if(r != nil)
 	for(;;) {
 		p = r->prog;
 		if(p->as != ANOP || p->from.type != D_NONE || p->to.type != D_NONE)
 			break;
 		r1 = uniqs(r);
-		if(r1 == R)
+		if(r1 == nil)
 			break;
 		r = r1;
 	}
@@ -75,49 +82,25 @@ rnops(Reg *r)
 }

 void
-peep(void)
+peep(Prog *firstp)
 {
-	Reg *r, *r1, *r2;
+	Flow *r, *r1;
+	Graph *g;
 	Prog *p, *p1;
 	int t;
-	ProgInfo info;
-
-	/*
-	 * complete R structure
-	 */
-	for(r=firstr; r!=R; r=r1) {
-		r1 = r->link;
-		if(r1 == R)
-			break;
-		for(p = r->prog->link; p != r1->prog; p = p->link) {
-			proginfo(&info, p);
-			if(info.flags & Skip)
-				continue;
-
-			r2 = rega();
-			r->link = r2;
-			r2->link = r1;

-			r2->prog = p;
-			p->opt = r2;
-
-			r2->p1 = r;
-			r->s1 = r2;
-			r2->s1 = r1;
-			r1->p1 = r2;
-
-			r = r2;
-		}
-	}
+	g = flowstart(firstp, sizeof(Flow));
+	if(g == nil)
+		return;

 	// byte, word arithmetic elimination.
-	elimshortmov(r);
+	elimshortmov(g);

 	// constant propagation
-	// find MOV $con,R followed by
-	// another MOV $con,R without
-	// setting R in the interim
-	for(r=firstr; r!=R; r=r->link) {
+	// find MOV $con,R followed by
+	// another MOV $con,R without
+	// setting R in the interim
+	for(r=g->start; r!=nil; r=r->link) {
 		p = r->prog;
 		switch(p->as) {
 		case ALEAL:
@@ -141,10 +124,10 @@ peep(void)

 loop1:
 	if(debug['P'] && debug['v'])
-		dumpit("loop1", firstr);
+		dumpit("loop1", g->start, 0);

 	t = 0;
-	for(r=firstr; r!=R; r=r->link) {
+	for(r=g->start; r!=nil; r=r->link) {
 		p = r->prog;
 		switch(p->as) {
 		case AMOVL:
@@ -152,11 +135,11 @@ loop1:
 		case AMOVSD:
 			if(regtyp(&p->to))
 			if(regtyp(&p->from)) {
-				if(copyprop(r)) {
+				if(copyprop(g, r)) {
 					excise(r);
 					t++;
 				} else
-				if(subprop(r) && copyprop(r)) {
+				if(subprop(r) && copyprop(g, r)) {
 					excise(r);
 					t++;
 				}
@@ -169,7 +152,7 @@ loop1:
 		case AMOVWLSX:
 			if(regtyp(&p->to)) {
 				r1 = rnops(uniqs(r));
-				if(r1 != R) {
+				if(r1 != nil) {
 					p1 = r1->prog;
 					if(p->as == p1->as && p->to.type == p1->from.type){
 						p1->as = AMOVL;
@@ -232,7 +215,7 @@ loop1:
 	// can be replaced by MOVAPD, which moves the pair of float64s
 	// instead of just the lower one.  We only use the lower one, but
 	// the processor can do better if we do moves using both.
-	for(r=firstr; r!=R; r=r->link) {
+	for(r=g->start; r!=nil; r=r->link) {
 		p = r->prog;
 		if(p->as == AMOVSD)
 		if(regtyp(&p->from))
@@ -242,7 +225,7 @@ loop1:
 }

 void
-excise(Reg *r)
+excise(Flow *r)
 {
 	Prog *p;

@@ -257,39 +240,7 @@ excise(Reg *r)
 	ostats.ndelmov++;
 }

-Reg*
-uniqp(Reg *r)
-{
-	Reg *r1;
-
-	r1 = r->p1;
-	if(r1 == R) {
-		r1 = r->p2;
-		if(r1 == R || r1->p2link != R)
-			return R;
-	} else
-		if(r->p2 != R)
-			return R;
-	return r1;
-}
-
-Reg*
-uniqs(Reg *r)
-{
-	Reg *r1;
-
-	r1 = r->s1;
-	if(r1 == R) {
-		r1 = r->s2;
-		if(r1 == R)
-			return R;
-	} else
-		if(r->s2 != R)
-			return R;
-	return r1;
-}
-
-int
+static int
 regtyp(Adr *a)
 {
 	int t;
@@ -310,11 +261,12 @@ regtyp(Adr *a)
 // can smash the entire 64-bit register without
 // causing any trouble.
 static void
-elimshortmov(Reg *r)
+elimshortmov(Graph *g)
 {
 	Prog *p;
+	Flow *r;

-	for(r=firstr; r!=R; r=r->link) {
+	for(r=g->start; r!=nil; r=r->link) {
 		p = r->prog;
 		if(regtyp(&p->to)) {
 			switch(p->as) {
@@ -409,12 +361,12 @@ elimshortmov(Reg *r)
 * hopefully, then the former or latter MOV
 * will be eliminated by copy propagation.
 */
-int
-subprop(Reg *r0)
+static int
+subprop(Flow *r0)
 {
 	Prog *p;
 	Adr *v1, *v2;
-	Reg *r;
+	Flow *r;
 	int t;
 	ProgInfo info;

@@ -425,10 +377,10 @@ subprop(Reg *r0)
 	v2 = &p->to;
 	if(!regtyp(v2))
 		return 0;
-	for(r=uniqp(r0); r!=R; r=uniqp(r)) {
+	for(r=uniqp(r0); r!=nil; r=uniqp(r)) {
 		if(debug['P'] && debug['v'])
 			print("\t? %P\n", r->prog);
-		if(uniqs(r) == R)
+		if(uniqs(r) == nil)
 			break;
 		p = r->prog;
 		proginfo(&info, p);
@@ -483,25 +435,25 @@ gotit:
 *	set v1	F=1
 *	set v2	return success
 */
-int
-copyprop(Reg *r0)
+static int
+copyprop(Graph *g, Flow *r0)
 {
 	Prog *p;
 	Adr *v1, *v2;
-	Reg *r;
+	Flow *r;

 	p = r0->prog;
 	v1 = &p->from;
 	v2 = &p->to;
 	if(copyas(v1, v2))
 		return 1;
-	for(r=firstr; r!=R; r=r->link)
+	for(r=g->start; r!=nil; r=r->link)
 		r->active = 0;
 	return copy1(v1, v2, r0->s1, 0);
 }

-int
-copy1(Adr *v1, Adr *v2, Reg *r, int f)
+static int
+copy1(Adr *v1, Adr *v2, Flow *r, int f)
 {
 	int t;
 	Prog *p;
@@ -514,11 +466,11 @@ copy1(Adr *v1, Adr *v2, Reg *r, int f)
 	r->active = 1;
 	if(debug['P'])
 		print("copy %D->%D f=%d\n", v1, v2, f);
-	for(; r != R; r = r->s1) {
+	for(; r != nil; r = r->s1) {
 		p = r->prog;
 		if(debug['P'])
 			print("%P", p);
-		if(!f && uniqp(r) == R) {
+		if(!f && uniqp(r) == nil) {
 			f = 1;
 			if(debug['P'])
 				print("; merge; f=%d", f);
@@ -672,7 +624,7 @@ copyu(Prog *p, Adr *v, Adr *s)
 * could be set/use depending on
 * semantics
 */
-int
+static int
 copyas(Adr *a, Adr *v)
 {
 	if(a->type != v->type)
@@ -688,7 +640,7 @@ copyas(Adr *a, Adr *v)
 /*
 * either direct or indirect
 */
-int
+static int
 copyau(Adr *a, Adr *v)
 {

@@ -707,7 +659,7 @@ copyau(Adr *a, Adr *v)
 * substitute s for v in a
 * return failure to substitute
 */
-int
+static int
 copysub(Adr *a, Adr *v, Adr *s, int f)
 {
 	int t;
@@ -740,9 +692,9 @@ copysub(Adr *a, Adr *v, Adr *s, int f)
 }

 static void
-conprop(Reg *r0)
+conprop(Flow *r0)
 {
-	Reg *r;
+	Flow *r;
 	Prog *p, *p0;
 	int t;
 	Adr *v0;
@@ -753,9 +705,9 @@ conprop(Reg *r0)

 loop:
 	r = uniqs(r);
-	if(r == R || r == r0)
+	if(r == nil || r == r0)
 		return;
-	if(uniqp(r) == R)
+	if(uniqp(r) == nil)
 		return;

 	p = r->prog;

--- a/src/cmd/8g/reg.c
+++ b/src/cmd/8g/reg.c
--- a/src/cmd/gc/popt.c
+++ b/src/cmd/gc/popt.c
@@ -181,3 +181,284 @@ fixjmp(Prog *firstp)
 		print("\n");
 	}
 }
+
+// Control flow analysis. The Flow structures hold predecessor and successor
+// information as well as basic loop analysis.
+//
+//	graph = flowstart(firstp, sizeof(Flow));
+//	... use flow graph ...
+//	flowend(graph); // free graph
+//
+// Typical uses of the flow graph are to iterate over all the flow-relevant instructions:
+//
+//	for(f = graph->start; f != nil; f = f->link)
+//
+// or, given an instruction f, to iterate over all the predecessors, which is
+// f->p1 and this list:
+//
+//	for(f2 = f->p2; f2 != nil; f2 = f2->p2link)
+//	
+// Often the Flow struct is embedded as the first field inside a larger struct S.
+// In that case casts are needed to convert Flow* to S* in many places but the
+// idea is the same. Pass sizeof(S) instead of sizeof(Flow) to flowstart.
+
+// flowstart builds the flow graph for the instruction list starting at firstp.
+// size is the number of bytes reserved per annotated instruction; the code
+// below writes Flow fields at the start of each slot, so callers pass
+// sizeof(Flow) or the size of a larger struct that begins with a Flow.
+//
+// Instructions whose proginfo flags include Skip get no Flow node and keep
+// p->opt == nil; every other instruction has p->opt set to its node.
+// Returns nil (with every p->opt left nil) if there is nothing to annotate
+// or if the function has 20000 or more annotated instructions.
+// The result is released with flowend.
+Graph*
+flowstart(Prog *firstp, int size)
+{
+	int nf;
+	Flow *f, *f1, *start, *last;
+	Graph *graph;
+	Prog *p;
+	ProgInfo info;
+
+	// Count and mark instructions to annotate.
+	nf = 0;
+	for(p = firstp; p != P; p = p->link) {
+		p->opt = nil; // should be already, but just in case
+		proginfo(&info, p);
+		if(info.flags & Skip)
+			continue;
+		p->opt = (void*)1;
+		nf++;
+	}
+	
+	if(nf == 0)
+		return nil;
+
+	if(nf >= 20000) {
+		// fatal("%S is too big (%d instructions)", curfn->nname->sym, nf);
+		// Giving up: remove the placeholder marks so that no instruction
+		// is left holding a bogus non-nil opt pointer.
+		for(p = firstp; p != P; p = p->link)
+			p->opt = nil;
+		return nil;
+	}
+
+	// Allocate annotations and assign to instructions.
+	// The nodes live in the same allocation, directly after the Graph header.
+	graph = calloc(sizeof *graph + size*nf, 1);
+	if(graph == nil)
+		fatal("out of memory");
+	start = (Flow*)(graph+1);
+	last = nil;
+	f = start;
+	for(p = firstp; p != P; p = p->link) {
+		if(p->opt == nil)
+			continue;
+		p->opt = f;
+		f->prog = p;
+		if(last)
+			last->link = f;
+		last = f;
+		
+		// Step by the caller's element size, not sizeof(Flow).
+		f = (Flow*)((uchar*)f + size);
+	}
+
+	// Fill in pred/succ information.
+	for(f = start; f != nil; f = f->link) {
+		p = f->prog;
+		proginfo(&info, p);
+		if(!(info.flags & Break)) {
+			f1 = f->link;
+			f->s1 = f1;
+			// The last annotated instruction has no following node;
+			// do not dereference nil if it happens not to be a Break.
+			if(f1 != nil)
+				f1->p1 = f;
+		}
+		if(p->to.type == D_BRANCH) {
+			if(p->to.u.branch == P)
+				fatal("pnil %P", p);
+			f1 = p->to.u.branch->opt;
+			if(f1 == nil)
+				fatal("fnil %P / %P", p, p->to.u.branch);
+			if(f1 == f) {
+				//fatal("self loop %P", p);
+				continue;
+			}
+			// Record the branch edge and push f onto the target's p2 list.
+			f->s2 = f1;
+			f->p2link = f1->p2;
+			f1->p2 = f;
+		}
+	}
+	
+	graph->start = start;
+	graph->num = nf;
+	return graph;
+}
+
+// flowend releases a graph returned by flowstart. It clears the opt pointer
+// of every annotated instruction and frees the graph together with the
+// reverse-postorder array that flowrpo may have attached to it.
+void
+flowend(Graph *graph)
+{
+	Flow *f;
+	
+	for(f = graph->start; f != nil; f = f->link)
+		f->prog->opt = nil;
+	// rpo is a separate allocation made by flowrpo; it is nil (and free is
+	// a no-op) when flowrpo was never called.
+	free(graph->rpo);
+	free(graph);
+}
+
+/*
+ * find looping structure
+ *
+ * 1) find reverse postordering
+ * 2) find approximate dominators,
+ *	the actual dominators if the flow graph is reducible
+ *	otherwise, dominators plus some other non-dominators.
+ *	See Matthew S. Hecht and Jeffrey D. Ullman,
+ *	"Analysis of a Simple Algorithm for Global Data Flow Problems",
+ *	Conf.  Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
+ *	Oct. 1-3, 1973, pp.  207-217.
+ * 3) find all nodes with a predecessor dominated by the current node.
+ *	such a node is a loop head.
+ *	recursively, all preds with a greater rpo number are in the loop
+ */
+// postorder walks the successors of r depth-first and appends each node to
+// rpo2r after its successors. n is the next free slot in rpo2r; the updated
+// count is returned. A non-zero rpo field serves as the "visited" mark.
+static int32
+postorder(Flow *r, Flow **rpo2r, int32 n)
+{
+	r->rpo = 1;
+	if(r->s1 != nil && r->s1->rpo == 0)
+		n = postorder(r->s1, rpo2r, n);
+	if(r->s2 != nil && r->s2->rpo == 0)
+		n = postorder(r->s2, rpo2r, n);
+	rpo2r[n] = r;
+	return n + 1;
+}
+
+// rpolca returns the point where the idom chains starting at rpo1 and rpo2
+// meet. rpo1 == -1 means "no candidate yet", in which case rpo2 is returned.
+// An idom entry that does not decrease is reported with fatal("bad idom").
+static int32
+rpolca(int32 *idom, int32 rpo1, int32 rpo2)
+{
+	int32 lo, hi, up;
+
+	if(rpo1 == -1)
+		return rpo2;
+
+	lo = rpo1;
+	hi = rpo2;
+	for(;;) {
+		if(lo == hi)
+			return lo;
+		if(lo > hi) {
+			up = hi;
+			hi = lo;
+			lo = up;
+		}
+		// Here lo < hi: climb from the larger number until it is no
+		// longer above the smaller one.
+		do {
+			up = idom[hi];
+			if(up >= hi)
+				fatal("bad idom");
+			hi = up;
+		} while(lo < hi);
+	}
+}
+
+// doms reports whether following idom links upward from s arrives exactly
+// at r (r and s are reverse-postorder numbers).
+static int
+doms(int32 *idom, int32 r, int32 s)
+{
+	int32 cur;
+
+	for(cur = s; cur > r; cur = idom[cur])
+		;
+	return cur == r;
+}
+
+// loophead reports whether r has at least one predecessor (p1 or any entry
+// on its p2 list) that doms() says is dominated by r itself.
+static int
+loophead(int32 *idom, Flow *r)
+{
+	Flow *pred;
+	int32 head;
+
+	head = r->rpo;
+
+	pred = r->p1;
+	if(pred != nil)
+	if(doms(idom, head, pred->rpo))
+		return 1;
+
+	pred = r->p2;
+	while(pred != nil) {
+		if(doms(idom, head, pred->rpo))
+			return 1;
+		pred = pred->p2link;
+	}
+	return 0;
+}
+
+// loopmark adds LOOP to the loop weight of r and, recursively, of every
+// predecessor whose rpo number is not below head. The active field is set
+// to head on each visited node so that no node is counted twice for the
+// same head.
+static void
+loopmark(Flow **rpo2r, int32 head, Flow *r)
+{
+	Flow *pred;
+
+	if(r->rpo >= head && r->active != head) {
+		r->active = head;
+		r->loop += LOOP;
+		if(r->p1 != nil)
+			loopmark(rpo2r, head, r->p1);
+		pred = r->p2;
+		while(pred != nil) {
+			loopmark(rpo2r, head, pred);
+			pred = pred->p2link;
+		}
+	}
+}
+
+// flowrpo numbers the nodes reachable from g->start in reverse postorder,
+// stores that ordering in g->rpo (so g->rpo[f->rpo] == f for every reachable
+// node f), computes approximate immediate dominators, and uses them to add
+// loop weights to the loop field of each node.
+//
+// It may be called again on the same graph: the previous g->rpo array is
+// freed and the per-node rpo and loop fields are recomputed from scratch.
+// The active field of the nodes is overwritten by the loop marking.
+void
+flowrpo(Graph *g)
+{
+	Flow *r1;
+	int32 i, d, me, nr, *idom;
+	Flow **rpo2r;
+
+	free(g->rpo);
+	g->rpo = calloc(g->num*sizeof g->rpo[0], 1);
+	idom = calloc(g->num*sizeof idom[0], 1);
+	if(g->rpo == nil || idom == nil)
+		fatal("out of memory");
+
+	// postorder treats a non-zero rpo as "already visited" and the loop
+	// weights below are accumulated, so discard anything left behind by
+	// an earlier run before starting.
+	for(r1 = g->start; r1 != nil; r1 = r1->link) {
+		r1->rpo = 0;
+		r1->loop = 0;
+	}
+
+	rpo2r = g->rpo;
+	d = postorder(g->start, rpo2r, 0);
+	nr = g->num;
+	if(d > nr)
+		fatal("too many reg nodes %d %d", d, nr);
+	nr = d;
+	// postorder filled rpo2r in postorder; reverse it in place.
+	for(i = 0; i < nr / 2; i++) {
+		r1 = rpo2r[i];
+		rpo2r[i] = rpo2r[nr - 1 - i];
+		rpo2r[nr - 1 - i] = r1;
+	}
+	for(i = 0; i < nr; i++)
+		rpo2r[i]->rpo = i;
+
+	idom[0] = 0;
+	for(i = 0; i < nr; i++) {
+		r1 = rpo2r[i];
+		me = r1->rpo;
+		d = -1;
+		// rpo2r[r->rpo] == r protects against considering dead code,
+		// which has r->rpo == 0.
+		if(r1->p1 != nil && rpo2r[r1->p1->rpo] == r1->p1 && r1->p1->rpo < me)
+			d = r1->p1->rpo;
+		for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
+			if(rpo2r[r1->rpo] == r1 && r1->rpo < me)
+				d = rpolca(idom, d, r1->rpo);
+		idom[i] = d;
+	}
+
+	// Every reachable node gets a base weight of 1; nodes inside loops
+	// get LOOP added once per enclosing loop head found.
+	for(i = 0; i < nr; i++) {
+		r1 = rpo2r[i];
+		r1->loop++;
+		if(r1->p2 != nil && loophead(idom, r1))
+			loopmark(rpo2r, i, r1);
+	}
+	free(idom);
+}
+
+// uniqp returns the predecessor of r if r has exactly one, or nil if it
+// has none or more than one.
+Flow*
+uniqp(Flow *r)
+{
+	if(r->p1 != nil) {
+		// p1 is present, so any entry on the p2 list makes it non-unique.
+		if(r->p2 != nil)
+			return nil;
+		return r->p1;
+	}
+
+	// No p1: the p2 list must hold exactly one node.
+	if(r->p2 == nil || r->p2->p2link != nil)
+		return nil;
+	return r->p2;
+}
+
+// uniqs returns the successor of r if r has exactly one (s1 or s2, but not
+// both), or nil if it has none or two.
+Flow*
+uniqs(Flow *r)
+{
+	// Without s1 the answer is whatever s2 holds, nil included.
+	if(r->s1 == nil)
+		return r->s2;
+	if(r->s2 != nil)
+		return nil;
+	return r->s1;
+}
+
--- a/src/cmd/gc/popt.h
+++ b/src/cmd/gc/popt.h
@@ -2,5 +2,39 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

+typedef struct Flow Flow;
+typedef struct Graph Graph;
+
+struct Flow {
+	Prog*	prog;   	// actual instruction
+	Flow*	p1;     	// predecessors of this instruction: p1,
+	Flow*	p2;     	// and then p2 linked through p2link.
+	Flow*	p2link;	// next node on the p2 list that this node belongs to
+	Flow*	s1;     	// successors of this instruction (at most two: s1 and s2).
+	Flow*	s2;	// branch target, when the instruction is a branch to another node
+	Flow*	link;   	// next instruction in function code
+	
+	int32	active;	// usable by client; also overwritten by flowrpo's loop marking
+
+	int32	rpo;		// reverse postorder number, assigned by flowrpo
+	uint16	loop;		// loop weight computed by flowrpo: 1, plus LOOP for every enclosing loop
+	uchar	refset;		// diagnostic generated
+};
+
+struct Graph
+{
+	Flow*	start;	// first Flow node; the rest are reached through Flow.link
+	int	num;	// number of Flow nodes allocated for the graph
+	
+	// After calling flowrpo, rpo lists the flow nodes in reverse postorder,
+	// and each non-dead Flow node f has g->rpo[f->rpo] == f.
+	Flow**	rpo;
+};
+
 void	fixjmp(Prog*);
+Graph*	flowstart(Prog*, int);
+void	flowrpo(Graph*);
+void	flowend(Graph*);
 int	noreturn(Prog*);
+Flow*	uniqp(Flow*);
+Flow*	uniqs(Flow*);