diff --git a/src/cmd/6g/ggen.c b/src/cmd/6g/ggen.c
index 987473ccab60908acd7d4cbea6fe74c8bbd79c90..363620769d911d414c57d85ab6055e4b16c68a97 100644
--- a/src/cmd/6g/ggen.c
+++ b/src/cmd/6g/ggen.c
@@ -1102,26 +1102,54 @@ clearfat(Node *nl)
 	c = w % 8;	// bytes
 	q = w / 8;	// quads
 
+	if(q < 4) {
+		// Write sequence of MOV 0, off(base) instead of using STOSQ.
+		// The hope is that although the code will be slightly longer,
+		// the MOVs will have no dependencies and pipeline better
+		// than the unrolled STOSQ loop.
+		// NOTE: Must use agen, not igen, so that optimizer sees address
+		// being taken. We are not writing on field boundaries.
+		agenr(nl, &n1, N);
+		n1.op = OINDREG;
+		nodconst(&z, types[TUINT64], 0);
+		while(q-- > 0) {
+			n1.type = z.type;
+			gins(AMOVQ, &z, &n1);
+			n1.xoffset += 8;
+		}
+		if(c >= 4) {
+			nodconst(&z, types[TUINT32], 0);
+			n1.type = z.type;
+			gins(AMOVL, &z, &n1);
+			n1.xoffset += 4;
+			c -= 4;
+		}
+		nodconst(&z, types[TUINT8], 0);
+		while(c-- > 0) {
+			n1.type = z.type;
+			gins(AMOVB, &z, &n1);
+			n1.xoffset++;
+		}
+		regfree(&n1);
+		return;
+	}
+
 	savex(D_DI, &n1, &oldn1, N, types[tptr]);
 	agen(nl, &n1);
 
 	savex(D_AX, &ax, &oldax, N, types[tptr]);
 	gconreg(AMOVL, 0, D_AX);
 
-	if(q > 128 || (q >= 4 && nacl)) {
+	if(q > 128 || nacl) {
 		gconreg(movptr, q, D_CX);
 		gins(AREP, N, N);	// repeat
 		gins(ASTOSQ, N, N);	// STOQ AL,*(DI)+
-	} else if(q >= 4) {
+	} else {
 		p = gins(ADUFFZERO, N, N);
 		p->to.type = D_ADDR;
 		p->to.sym = linksym(pkglookup("duffzero", runtimepkg));
 		// 2 and 128 = magic constants: see ../../runtime/asm_amd64.s
 		p->to.offset = 2*(128-q);
-	} else
-	while(q > 0) {
-		gins(ASTOSQ, N, N);	// STOQ AL,*(DI)+
-		q--;
 	}
 
 	z = ax;
diff --git a/src/cmd/8g/ggen.c b/src/cmd/8g/ggen.c
index 7c986cc6457b4c7d207800fefcd1f6ba9e1e5790..6333a60bb8a7d433dee97f4bfdc606933924fce6 100644
--- a/src/cmd/8g/ggen.c
+++ b/src/cmd/8g/ggen.c
@@ -157,7 +157,7 @@ void
 clearfat(Node *nl)
 {
 	uint32 w, c, q;
-	Node n1;
+	Node n1, z;
 	Prog *p;
 
 	/* clear a fat object */
@@ -172,6 +172,32 @@ clearfat(Node *nl)
 	c = w % 4;	// bytes
 	q = w / 4;	// quads
 
+	if(q < 4) {
+		// Small object: write a sequence of MOVL $0, off(base) (plus
+		// MOVB for the c tail bytes) instead of using STOSL. The hope is
+		// that although the code will be slightly longer, the MOVs will
+		// have no dependencies and pipeline better than unrolled STOSL.
+		// NOTE: Must use agen, not igen, so that optimizer sees address
+		// being taken. We are not writing on field boundaries.
+		regalloc(&n1, types[tptr], N);
+		agen(nl, &n1);
+		n1.op = OINDREG;
+		nodconst(&z, types[TUINT32], 0);	// 4-byte zero, matching MOVL's width
+		while(q-- > 0) {
+			n1.type = z.type;
+			gins(AMOVL, &z, &n1);
+			n1.xoffset += 4;
+		}
+		nodconst(&z, types[TUINT8], 0);
+		while(c-- > 0) {
+			n1.type = z.type;
+			gins(AMOVB, &z, &n1);
+			n1.xoffset++;
+		}
+		regfree(&n1);
+		return;
+	}
+
 	nodreg(&n1, types[tptr], D_DI);
 	agen(nl, &n1);
 	gconreg(AMOVL, 0, D_AX);
diff --git a/src/cmd/gc/gen.c b/src/cmd/gc/gen.c
index eb9eacca8f133d39c73ff2553d32b3dc27fc6b9f..a7db833a1a32a8a2bac91031ea41636e88190f8f 100644
--- a/src/cmd/gc/gen.c
+++ b/src/cmd/gc/gen.c
@@ -731,7 +731,7 @@ cgen_as(Node *nl, Node *nr)
 		return;
 	}
 
-	if(nr == N || isnil(nr)) {
+	if(nr == N || iszero(nr)) {
 		// externals and heaps should already be clear
 		if(nr == N) {
 			if(nl->class == PEXTERN)
diff --git a/src/cmd/gc/go.h b/src/cmd/gc/go.h
index 8178f7272f5427f7409e912f1f0c613fb0baf06d..475754145b18216ff990f7ebde0e1cd80d163ae7 100644
--- a/src/cmd/gc/go.h
+++ b/src/cmd/gc/go.h
@@ -1374,6 +1374,7 @@ int	isnilinter(Type *t);
 int	isptrto(Type *t, int et);
 int	isslice(Type *t);
 int	istype(Type *t, int et);
+int	iszero(Node *n);
 void	linehist(char *file, int32 off, int relative);
 NodeList*	list(NodeList *l, Node *n);
 NodeList*	list1(Node *n);
diff --git a/src/cmd/gc/mparith2.c b/src/cmd/gc/mparith2.c
index 5cf98c62c634cdfd91ae3328a69f57c662b3452c..fd9f591ceae2ac164b48b6be4ffdad8a04af04b4 100644
--- a/src/cmd/gc/mparith2.c
+++ b/src/cmd/gc/mparith2.c
@@ -656,7 +656,7 @@ mpdivmodfixfix(Mpint *q, Mpint *r, Mpint *n, Mpint *d)
 }
 
 static int
-iszero(Mpint *a)
+mpiszero(Mpint *a)
 {
 	long *a1;
 	int i;
@@ -687,7 +687,7 @@ mpdivfract(Mpint *a, Mpint *b)
 		for(j=0; j<Mpscale; j++) {
 			x <<= 1;
 			if(mpcmp(&d, &n) <= 0) {
-				if(!iszero(&d))
+				if(!mpiszero(&d))
 					x |= 1;
 				mpsubfixfix(&n, &d);
 			}
diff --git a/src/cmd/gc/sinit.c b/src/cmd/gc/sinit.c
index f050026d9d07d570e71b909657f698b0b1a9d64e..2a811513c9b3e257f43e81b13f2d65fe71214e5f 100644
--- a/src/cmd/gc/sinit.c
+++ b/src/cmd/gc/sinit.c
@@ -17,7 +17,6 @@ enum
 	InitPending = 2,
 };
 
-static int iszero(Node*);
 static void initplan(Node*);
 static NodeList *initlist;
 static void init2(Node*, NodeList**);
@@ -1356,7 +1355,6 @@ no:
 	return 0;
 }
 
-static int iszero(Node*);
 static int isvaluelit(Node*);
 static InitEntry* entry(InitPlan*);
 static void addvalue(InitPlan*, vlong, Node*, Node*);
@@ -1440,7 +1438,7 @@ addvalue(InitPlan *p, vlong xoffset, Node *key, Node *n)
 	e->expr = n;
 }
 
-static int
+int
 iszero(Node *n)
 {
 	NodeList *l;
diff --git a/src/cmd/gc/walk.c b/src/cmd/gc/walk.c
index 241d7d74adbc9029c12a02c12de63e0c0fbbd628..7f2748c668d05f0e428a1f2d4bafc0fdf25e436e 100644
--- a/src/cmd/gc/walk.c
+++ b/src/cmd/gc/walk.c
@@ -1390,7 +1390,12 @@ walkexpr(Node **np, NodeList **init)
 	case OMAPLIT:
 	case OSTRUCTLIT:
 	case OPTRLIT:
-		// XXX TODO do we need to clear var?
+		// NOTE(rsc): The race detector cannot handle seeing
+		// a STRUCTLIT or ARRAYLIT representing a zero value,
+		// so in race mode always make a temporary for those.
+		// Otherwise, leave zero values in place.
+		if(iszero(n) && !flag_race)
+			goto ret;
 		var = temp(n->type);
 		anylit(0, n, var, init);
 		n = var;
@@ -2009,8 +2014,8 @@ needwritebarrier(Node *l, Node *r)
 	if(isstack(l))
 		return 0;
 
-	// No write barrier for zeroing.
-	if(r == N)
+	// No write barrier for implicit or explicit zeroing.
+	if(r == N || iszero(r))
 		return 0;
 
 	// No write barrier for initialization to constant.