diff --git a/src/cmd/6g/ggen.c b/src/cmd/6g/ggen.c index 987473ccab60908acd7d4cbea6fe74c8bbd79c90..363620769d911d414c57d85ab6055e4b16c68a97 100644 --- a/src/cmd/6g/ggen.c +++ b/src/cmd/6g/ggen.c @@ -1102,26 +1102,54 @@ clearfat(Node *nl) c = w % 8; // bytes q = w / 8; // quads + if(q < 4) { + // Write sequence of MOV 0, off(base) instead of using STOSQ. + // The hope is that although the code will be slightly longer, + // the MOVs will have no dependencies and pipeline better + // than the unrolled STOSQ loop. + // NOTE: Must use agen, not igen, so that optimizer sees address + // being taken. We are not writing on field boundaries. + agenr(nl, &n1, N); + n1.op = OINDREG; + nodconst(&z, types[TUINT64], 0); + while(q-- > 0) { + n1.type = z.type; + gins(AMOVQ, &z, &n1); + n1.xoffset += 8; + } + if(c >= 4) { + nodconst(&z, types[TUINT32], 0); + n1.type = z.type; + gins(AMOVL, &z, &n1); + n1.xoffset += 4; + c -= 4; + } + nodconst(&z, types[TUINT8], 0); + while(c-- > 0) { + n1.type = z.type; + gins(AMOVB, &z, &n1); + n1.xoffset++; + } + regfree(&n1); + return; + } + savex(D_DI, &n1, &oldn1, N, types[tptr]); agen(nl, &n1); savex(D_AX, &ax, &oldax, N, types[tptr]); gconreg(AMOVL, 0, D_AX); - if(q > 128 || (q >= 4 && nacl)) { + if(q > 128 || nacl) { gconreg(movptr, q, D_CX); gins(AREP, N, N); // repeat gins(ASTOSQ, N, N); // STOQ AL,*(DI)+ - } else if(q >= 4) { + } else { p = gins(ADUFFZERO, N, N); p->to.type = D_ADDR; p->to.sym = linksym(pkglookup("duffzero", runtimepkg)); // 2 and 128 = magic constants: see ../../runtime/asm_amd64.s p->to.offset = 2*(128-q); - } else - while(q > 0) { - gins(ASTOSQ, N, N); // STOQ AL,*(DI)+ - q--; } z = ax; diff --git a/src/cmd/8g/ggen.c b/src/cmd/8g/ggen.c index 7c986cc6457b4c7d207800fefcd1f6ba9e1e5790..6333a60bb8a7d433dee97f4bfdc606933924fce6 100644 --- a/src/cmd/8g/ggen.c +++ b/src/cmd/8g/ggen.c @@ -157,7 +157,7 @@ void clearfat(Node *nl) { uint32 w, c, q; - Node n1; + Node n1, z; Prog *p; /* clear a fat object */ @@ -172,6 +172,32 @@ 
clearfat(Node *nl) c = w % 4; // bytes q = w / 4; // quads + if(q < 4) { + // Write sequence of MOV 0, off(base) instead of using STOSL. + // The hope is that although the code will be slightly longer, + // the MOVs will have no dependencies and pipeline better + // than the unrolled STOSL loop. + // NOTE: Must use agen, not igen, so that optimizer sees address + // being taken. We are not writing on field boundaries. + regalloc(&n1, types[tptr], N); + agen(nl, &n1); + n1.op = OINDREG; + nodconst(&z, types[TUINT32], 0); // 32-bit zero, matching the MOVL store width + while(q-- > 0) { + n1.type = z.type; + gins(AMOVL, &z, &n1); + n1.xoffset += 4; + } + nodconst(&z, types[TUINT8], 0); + while(c-- > 0) { + n1.type = z.type; + gins(AMOVB, &z, &n1); + n1.xoffset++; + } + regfree(&n1); + return; + } + nodreg(&n1, types[tptr], D_DI); agen(nl, &n1); gconreg(AMOVL, 0, D_AX); diff --git a/src/cmd/gc/gen.c b/src/cmd/gc/gen.c index eb9eacca8f133d39c73ff2553d32b3dc27fc6b9f..a7db833a1a32a8a2bac91031ea41636e88190f8f 100644 --- a/src/cmd/gc/gen.c +++ b/src/cmd/gc/gen.c @@ -731,7 +731,7 @@ cgen_as(Node *nl, Node *nr) return; } - if(nr == N || isnil(nr)) { + if(nr == N || iszero(nr)) { // externals and heaps should already be clear if(nr == N) { if(nl->class == PEXTERN) diff --git a/src/cmd/gc/go.h b/src/cmd/gc/go.h index 8178f7272f5427f7409e912f1f0c613fb0baf06d..475754145b18216ff990f7ebde0e1cd80d163ae7 100644 --- a/src/cmd/gc/go.h +++ b/src/cmd/gc/go.h @@ -1374,6 +1374,7 @@ int isnilinter(Type *t); int isptrto(Type *t, int et); int isslice(Type *t); int istype(Type *t, int et); +int iszero(Node *n); void linehist(char *file, int32 off, int relative); NodeList* list(NodeList *l, Node *n); NodeList* list1(Node *n); diff --git a/src/cmd/gc/mparith2.c b/src/cmd/gc/mparith2.c index 5cf98c62c634cdfd91ae3328a69f57c662b3452c..fd9f591ceae2ac164b48b6be4ffdad8a04af04b4 100644 --- a/src/cmd/gc/mparith2.c +++ b/src/cmd/gc/mparith2.c @@ -656,7 +656,7 @@ mpdivmodfixfix(Mpint *q, Mpint *r, Mpint *n, Mpint *d) } static int -iszero(Mpint *a) 
+mpiszero(Mpint *a) { long *a1; int i; @@ -687,7 +687,7 @@ mpdivfract(Mpint *a, Mpint *b) for(j=0; j<Mpscale; j++) { x <<= 1; if(mpcmp(&d, &n) <= 0) { - if(!iszero(&d)) + if(!mpiszero(&d)) x |= 1; mpsubfixfix(&n, &d); } diff --git a/src/cmd/gc/sinit.c b/src/cmd/gc/sinit.c index f050026d9d07d570e71b909657f698b0b1a9d64e..2a811513c9b3e257f43e81b13f2d65fe71214e5f 100644 --- a/src/cmd/gc/sinit.c +++ b/src/cmd/gc/sinit.c @@ -17,7 +17,6 @@ enum InitPending = 2, }; -static int iszero(Node*); static void initplan(Node*); static NodeList *initlist; static void init2(Node*, NodeList**); @@ -1356,7 +1355,6 @@ no: return 0; } -static int iszero(Node*); static int isvaluelit(Node*); static InitEntry* entry(InitPlan*); static void addvalue(InitPlan*, vlong, Node*, Node*); @@ -1440,7 +1438,7 @@ addvalue(InitPlan *p, vlong xoffset, Node *key, Node *n) e->expr = n; } -static int +int iszero(Node *n) { NodeList *l; diff --git a/src/cmd/gc/walk.c b/src/cmd/gc/walk.c index 241d7d74adbc9029c12a02c12de63e0c0fbbd628..7f2748c668d05f0e428a1f2d4bafc0fdf25e436e 100644 --- a/src/cmd/gc/walk.c +++ b/src/cmd/gc/walk.c @@ -1390,7 +1390,12 @@ walkexpr(Node **np, NodeList **init) case OMAPLIT: case OSTRUCTLIT: case OPTRLIT: - // XXX TODO do we need to clear var? + // NOTE(rsc): Race detector cannot handle seeing + // a STRUCTLIT or ARRAYLIT representing a zero value, + // so make a temporary for those always in race mode. + // Otherwise, leave zero values in place. + if(iszero(n) && !flag_race) + goto ret; var = temp(n->type); anylit(0, n, var, init); n = var; @@ -2009,8 +2014,8 @@ needwritebarrier(Node *l, Node *r) if(isstack(l)) return 0; - // No write barrier for zeroing. - if(r == N) + // No write barrier for implicit or explicit zeroing. + if(r == N || iszero(r)) return 0; // No write barrier for initialization to constant.