[PATCH 5/9] Pass small-aggregate arguments by the AMD64 abi.
[Thread Prev] | [Thread Next]
- Subject: [PATCH 5/9] Pass small-aggregate arguments by the AMD64 abi.
- From: "S. Gilles" <sgilles@xxxxxxx>
- Reply-to: myrddin-dev@xxxxxxxxxxxxxx
- Date: Mon, 8 Jun 2020 00:21:12 -0400
- To: "myrddin-dev" <myrddin-dev@xxxxxxxxxxxxxx>
- Cc: "S. Gilles" <sgilles@xxxxxxx>
Generally, “aggregate types no larger than two eightbytes, such that each eightbyte is unambiguously integer or floating-point, are passed in registers”. Since things like structs are assumed to be on the stack (for addressing), this leads to inefficiencies in function calls. The code

    var foo : bar = [ .a = 123, .b = 4.567 ]
    baz(foo)

will store foo on the stack, then load it into registers to call baz, and the prologue of baz will immediately push foo back onto the stack. Ah, well.
--- 6/isel.c | 257 ++++++++++++++++++++++++++++++++++++++++++++++++------- 6/locs.c | 2 + 2 files changed, 229 insertions(+), 30 deletions(-) diff --git a/6/isel.c b/6/isel.c index 3dc139b9..3a0129c6 100644 --- a/6/isel.c +++ b/6/isel.c @@ -83,6 +83,28 @@ tymode(Type *t) return ModeNone; } +static Mode +forcefltmode(Mode m) +{ + assert(m != ModeNone); + switch (m) { + case ModeQ: return ModeD; + case ModeD: return ModeD; + default: return ModeF; + } +} + +static Mode +forceintmode(Mode m) +{ + assert(m != ModeNone); + switch (m) { + case ModeD: return ModeQ; + case ModeF: return ModeL; + default: return m; + } +} + static Mode mode(Node *n) { @@ -501,33 +523,87 @@ call(Isel *s, Node *n) } static void -placearg(Isel *s, Node *argn, Loc *argloc, Loc *rsp, int vararg, size_t *nfloats, size_t *nints, size_t *argoff) +placearg(Isel *s, Node *argn, Loc *argloc, PassIn p, Loc *rsp, int vararg, size_t *nfloats, size_t *nints, size_t *argoff) { + /* + placearg may be called when argn is stored at argloc, but it may also + be called when argloc is a small piece of argn, as in the case when + small structs are being passed. In those circumstances, p is PassInSSE + or PassInInt, and argn is irrelevant. Therefore, argn should not be + relied on when p is PassInSSE or PassInInt. 
+ */ Loc *src, *dst; size_t a; - if (stacknode(argn)) { - src = locreg(ModeQ); - g(s, Ilea, argloc, src, NULL); - a = tyalign(exprtype(argn)); - blit(s, rsp, src, *argoff, 0, size(argn), a); - *argoff += size(argn); - } else if (!vararg && isfloatmode(argloc->mode) && *nfloats < Nfloatregargs) { - dst = coreg(floatargregs[*nfloats], argloc->mode); + if (p == PassInNoPref) { + if (stacknode(argn)) { + p = PassInMemory; + } else if (!vararg && isfloatmode(argloc->mode) && *nfloats < Nfloatregargs) { + p = PassInSSE; + } else if (!vararg && isintmode(argloc->mode) && *nints < Nintregargs) { + p = PassInInt; + } else { + p = PassInMemory; + } + } + + switch (p) { + case PassInMemory: + if (stacknode(argn)) { + src = locreg(ModeQ); + g(s, Ilea, argloc, src, NULL); + a = tyalign(exprtype(argn)); + blit(s, rsp, src, *argoff, 0, size(argn), a); + *argoff += size(argn); + } else { + dst = locmem(*argoff, rsp, NULL, argloc->mode); + argloc = inri(s, argloc); + stor(s, argloc, dst); + *argoff += size(argn); + } + break; + case PassInSSE: + dst = coreg(floatargregs[*nfloats], forcefltmode(argloc->mode)); argloc = inri(s, argloc); - g(s, Imovs, argloc, dst, NULL); + if (isfloatmode(argloc->mode)) { + g(s, Imovs, argloc, dst, NULL); + } else { + g(s, Imov, argloc, dst, NULL); + } (*nfloats)++; - } else if (!vararg && isintmode(argloc->mode) && *nints < Nintregargs) { - dst = coreg(intargregs[*nints], argloc->mode); + break; + case PassInInt: + dst = coreg(intargregs[*nints], forceintmode(argloc->mode)); argloc = inri(s, argloc); g(s, Imov, argloc, dst, NULL); (*nints)++; - } else { - dst = locmem(*argoff, rsp, NULL, argloc->mode); - argloc = inri(s, argloc); - stor(s, argloc, dst); - *argoff += size(argn); + break; + case PassInNoPref: /* impossible */ + die("cannot determine how to pass arg"); + break; + } +} + +static int +sufficientregs(PassIn *pc, size_t sz, size_t nfloats, size_t nints) +{ + size_t needed_ints = 0; + size_t needed_floats = 0; + size_t i; + + if (sz > 16) { 
+ return 0; + } + + for (i = 1; i <= sz; i += 8) { + if (pc[i / 8] == PassInInt) { + needed_ints++; + } else if (pc[i / 8] == PassInSSE) { + needed_floats++; + } } + + return (needed_floats + nfloats <= Nfloatregargs) && (needed_ints + nints <= Nintregargs); } static Loc * @@ -535,6 +611,7 @@ gencall(Isel *s, Node *n) { Loc *arg; /* values we reduced */ size_t argsz, argoff, nargs, vasplit; + size_t sz; size_t nfloats, nints; Loc *retloc, *rsp, *ret; /* hard-coded registers */ Loc *stkbump; /* calculated stack offset */ @@ -572,7 +649,12 @@ gencall(Isel *s, Node *n) * one at a time, we evaluate the args in reverse order. * Not good. * - * Skip the first operand, since it's the function itself */ + * Skip the first operand, since it's the function itself + * + * Strictly speaking, we might waste a little space here, + * since some of these args might actually get passed in + * registers. + */ for (i = 0; i < nargs; i++) { argsz = align(argsz, min(size(args[i]), Ptrsz)); argsz += size(args[i]); @@ -590,13 +672,37 @@ gencall(Isel *s, Node *n) vararg = 0; for (i = 0; i < nargs; i++) { arg = selexpr(s, args[i]); - argoff = alignto(argoff, exprtype(args[i])); + t = exprtype(args[i]); + argoff = alignto(argoff, t); + sz = size(args[i]); if (i >= vasplit) vararg = 1; else argoff = align(argoff, 8); - - placearg(s, args[i], arg, rsp, vararg, &nfloats, &nints, &argoff); + if (!vararg && isaggregate(t) && sz <= 16) { + PassIn pc[2] = { PassInNoPref, PassInNoPref }; + classify(t, pc); + if (pc[0] == PassInMemory || pc[1] == PassInMemory || !sufficientregs(pc, sz, nfloats, nints)) { + placearg(s, args[i], arg, PassInMemory, rsp, vararg, &nfloats, &nints, &argoff); + } else { + assert(stacknode(args[i])); + assert(pc[0] != PassInNoPref); + placearg(s, args[i], arg, pc[0], rsp, vararg, &nfloats, &nints, &argoff); + if (size(args[i]) > 8) { + assert(pc[1] != PassInNoPref); + Loc *forcedreg = locreg(ModeQ); + if (arg->type == Loclbl || (arg->type == Locmeml && !arg->mem.base)) { + 
forcedreg = loclitl(arg->lbl); + } else { + g(s, Ilea, arg, forcedreg, NULL); + } + Loc *argpluseight = locmem(8, forcedreg, NULL, ModeQ); + placearg(s, args[i], argpluseight, pc[1], rsp, vararg, &nfloats, &nints, &argoff); + } + } + } else { + placearg(s, args[i], arg, PassInNoPref, rsp, vararg, &nfloats, &nints, &argoff); + } } call(s, n); if (argsz) @@ -976,26 +1082,55 @@ savedregs[] = { Rnone }; +static Mode +eightbytemode(PassIn p) +{ + if (p == PassInSSE) { + return ModeD; + } + + return ModeQ; +} + + +static void +movearg(Isel *s, Loc *dst, PassIn p, Mode m, size_t *nfloats, size_t *nints, size_t *argoff) +{ + Loc *a; + assert(m != ModeNone); + + switch(p) { + case PassInInt: + a = coreg(intargregs[*nints], forceintmode(m)); + g(s, Imov, a, dst, NULL); + (*nints)++; + break; + case PassInSSE: + a = coreg(floatargregs[*nfloats], forcefltmode(m)); + g(s, Imovs, a, dst, NULL); + (*nfloats)++; + break; + default: /* no need to move if on stack */ + break; + } +} + static void retrievearg(Isel *s, Node *argn, int vararg, size_t *nfloats, size_t *nints, size_t *argoff) { - Loc *a, *l; + Loc *l; if (stacknode(argn)) { htput(s->stkoff, argn, itop(-(*argoff + 2*Ptrsz))); *argoff += size(argn); } else if (!vararg && isfloatmode(mode(argn)) && *nfloats < Nfloatregargs) { - a = coreg(floatargregs[*nfloats], mode(argn)); l = loc(s, argn); - g(s, Imovs, a, l, NULL); + movearg(s, l, PassInSSE, forcefltmode(mode(argn)), nfloats, nints, argoff); htput(s->reglocs, argn, l); - (*nfloats)++; } else if (!vararg && isintmode(mode(argn)) && *nints < Nintregargs) { - a = coreg(intargregs[*nints], mode(argn)); l = loc(s, argn); - g(s, Imov, a, l, NULL); + movearg(s, l, PassInInt, forceintmode(mode(argn)), nfloats, nints, argoff); htput(s->reglocs, argn, l); - (*nints)++; } else if (tybase(decltype(argn))->type != Tyvoid) { /* varargs go on the stack */ htput(s->stkoff, argn, itop(-(*argoff + 2*Ptrsz))); @@ -1008,8 +1143,10 @@ addarglocs(Isel *s, Func *fn) { size_t i, nints, nfloats, 
nargs; size_t argoff; + size_t sz; int vararg; Node *arg; + Type *t; argoff = 0; nfloats = 0; @@ -1018,13 +1155,41 @@ addarglocs(Isel *s, Func *fn) nargs = countargs(fn->type); for (i = 0; i < fn->nargs; i++) { arg = fn->args[i]; - argoff = alignto(argoff, decltype(arg)); + t = decltype(arg); + argoff = alignto(argoff, t); + sz = size(arg); if (i >= nargs) vararg = 1; else argoff = align(argoff, 8); - retrievearg(s, arg, vararg, &nfloats, &nints, &argoff); + if (!vararg && isaggregate(t) && sz <= 16) { + PassIn pc[2] = { PassInNoPref, PassInNoPref }; + classify(t, pc); + if (pc[0] == PassInMemory || pc[1] == PassInMemory || !sufficientregs(pc, sz, nfloats, nints)) { + retrievearg(s, arg, vararg, &nfloats, &nints, &argoff); + } else { + /* + * Needs to be carefully put back on the stack, not simply located among + * registers. loc will succeed because the space was reserved in + * handlesmallstructargs. + */ + Loc *l = loc(s, arg); + movearg(s, l, pc[0], eightbytemode(pc[0]), &nfloats, &nints, &argoff); + if (size(arg) > 8) { + Loc *forcedreg = locreg(ModeQ); + if (l->type == Loclbl || (l->type == Locmeml && !l->mem.base)) { + forcedreg = loclitl(l->lbl); + } else { + g(s, Ilea, l, forcedreg, NULL); + } + Loc *lpluseight = locmem(8, forcedreg, NULL, ModeQ); + movearg(s, lpluseight, pc[1], eightbytemode(pc[1]), &nfloats, &nints, &argoff); + } + } + } else { + retrievearg(s, arg, vararg, &nfloats, &nints, &argoff); + } } } @@ -1107,6 +1272,37 @@ mkasmbb(Bb *bb) return as; } +static void +handlesmallstructargs(Isel *is, Func *fn) +{ + /* + * Perform a last-minute adjustment to fn->stksz to handle small structs + * that will be passed in registers. We do this inside selfunc so that + * generics will be specialized. 
+ */ + size_t vasplit = countargs(fn->type); + size_t i = 0, sz = 0; + Type *t; + Node *arg; + + for (i = 0; i < fn->nargs; i++) { + arg = fn->args[i]; + t = decltype(arg); + sz = size(arg); + + if (i < vasplit && isaggregate(t) && sz <= 16) { + PassIn pc[2] = { PassInNoPref, PassInNoPref }; + classify(t, pc); + if (pc[0] != PassInMemory && pc[1] != PassInMemory) { + sz = align(sz, 8); + fn->stksz += sz; + fn->stksz = align(fn->stksz, min(sz, Ptrsz)); + htput(fn->stkoff, fn->args[i], itop(fn->stksz)); + } + } + } +} + void selfunc(Isel *is, Func *fn, Htab *globls, Htab *strtab) { @@ -1132,6 +1328,7 @@ selfunc(Isel *is, Func *fn, Htab *globls, Htab *strtab) g(is, Iloc, locstrlbl(buf), NULL); } + handlesmallstructargs(is, fn); prologue(is, fn, fn->stksz); lastline = -1; for (j = 0; j < fn->cfg->nbb - 1; j++) { diff --git a/6/locs.c b/6/locs.c index aa26f01e..b314cd9a 100644 --- a/6/locs.c +++ b/6/locs.c @@ -176,6 +176,8 @@ loclit(long val, Mode m) Loc * coreg(Reg r, Mode m) { + assert(m != ModeNone); + Reg crtab[][Nmode + 1] = { [Ral] = {Rnone, Ral, Rax, Reax, Rrax}, [Rcl] = {Rnone, Rcl, Rcx, Recx, Rrcx}, -- 2.26.2
[PATCH 0/9] Handle small-aggregates via AMD64 abi | "S. Gilles" <sgilles@xxxxxxx> |
- Prev by Date: [PATCH 4/9] Add classification algorithm for small-struct passing.
- Next by Date: [PATCH 7/9] Bump abi version.
- Previous by thread: [PATCH 4/9] Add classification algorithm for small-struct passing.
- Next by thread: [PATCH 7/9] Bump abi version.
- Index(es):