[PATCH 5/9] Pass small-aggregate arguments by the AMD64 abi.
  [Thread Prev] | [Thread Next]
 
 
- Subject: [PATCH 5/9] Pass small-aggregate arguments by the AMD64 abi.
- From: "S. Gilles" <sgilles@xxxxxxx>
- Reply-to: myrddin-dev@xxxxxxxxxxxxxx
- Date: Mon, 8 Jun 2020 00:21:12 -0400
- To: "myrddin-dev" <myrddin-dev@xxxxxxxxxxxxxx>
- Cc: "S. Gilles" <sgilles@xxxxxxx>
Generally, “aggregate types no larger than two eightbytes (16 bytes), such
that each eightbyte is unambiguously integer or floating-point, are passed
in registers”.
Since things like structs are assumed to be on the stack (for
addressing), this leads to inefficiencies in function calls: the code
    var foo : bar = [ .a = 123, .b = 4.567 ]
    baz(foo)
will store foo on the stack, then load it into registers to call baz,
and the prologue of baz will immediately push foo back onto the stack.
Ah, well.
---
 6/isel.c | 257 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 6/locs.c |   2 +
 2 files changed, 229 insertions(+), 30 deletions(-)
diff --git a/6/isel.c b/6/isel.c
index 3dc139b9..3a0129c6 100644
--- a/6/isel.c
+++ b/6/isel.c
@@ -83,6 +83,28 @@ tymode(Type *t)
 	return ModeNone;
 }
 
+static Mode
+forcefltmode(Mode m)
+{
+	/* Float mode for carrying a value of mode m in an SSE register. */
+	assert(m != ModeNone);
+	if (m == ModeQ || m == ModeD) {
+		return ModeD;
+	}
+	return ModeF;
+}
+
+static Mode
+forceintmode(Mode m)
+{
+	/* Integer mode for carrying a value of mode m in an integer register. */
+	assert(m != ModeNone);
+	if (m == ModeD) {
+		return ModeQ;
+	}
+	return (m == ModeF) ? ModeL : m;
+}
+
 static Mode
 mode(Node *n)
 {
@@ -501,33 +523,87 @@ call(Isel *s, Node *n)
 }
 
 static void
-placearg(Isel *s, Node *argn, Loc *argloc, Loc *rsp, int vararg, size_t *nfloats, size_t *nints, size_t *argoff)
+placearg(Isel *s, Node *argn, Loc *argloc, PassIn p, Loc *rsp, int vararg, size_t *nfloats, size_t *nints, size_t *argoff)
 {
+	/*
+	   placearg may be called when argn is stored at argloc, but it may also
+	   be called when argloc is a small piece of argn, as in the case when
+	   small structs are being passed. In those circumstances, p is PassInSSE
+	   or PassInInt, and argn is irrelevant. Therefore, argn should not be
+	   relied on when p is PassInSSE or PassInInt.
+	 */
 	Loc *src, *dst;
 	size_t a;
 
-	if (stacknode(argn)) {
-		src = locreg(ModeQ);
-		g(s, Ilea, argloc, src, NULL);
-		a = tyalign(exprtype(argn));
-		blit(s, rsp, src, *argoff, 0, size(argn), a);
-		*argoff += size(argn);
-	} else if (!vararg && isfloatmode(argloc->mode) && *nfloats < Nfloatregargs) {
-		dst = coreg(floatargregs[*nfloats], argloc->mode);
+	if (p == PassInNoPref) {
+		if (stacknode(argn)) {
+			p = PassInMemory;
+		} else if (!vararg && isfloatmode(argloc->mode) && *nfloats < Nfloatregargs) {
+			p = PassInSSE;
+		} else if (!vararg && isintmode(argloc->mode) && *nints < Nintregargs) {
+			p = PassInInt;
+		} else {
+			p = PassInMemory;
+		}
+	}
+
+	switch (p) {
+	case PassInMemory:
+		if (stacknode(argn)) {
+			src = locreg(ModeQ);
+			g(s, Ilea, argloc, src, NULL);
+			a = tyalign(exprtype(argn));
+			blit(s, rsp, src, *argoff, 0, size(argn), a);
+			*argoff += size(argn);
+		} else {
+			dst = locmem(*argoff, rsp, NULL, argloc->mode);
+			argloc = inri(s, argloc);
+			stor(s, argloc, dst);
+			*argoff += size(argn);
+		}
+		break;
+	case PassInSSE:
+		dst = coreg(floatargregs[*nfloats], forcefltmode(argloc->mode));
 		argloc = inri(s, argloc);
-		g(s, Imovs, argloc, dst, NULL);
+		if (isfloatmode(argloc->mode)) {
+			g(s, Imovs, argloc, dst, NULL);
+		} else {
+			g(s, Imov, argloc, dst, NULL);
+		}
 		(*nfloats)++;
-	} else if (!vararg && isintmode(argloc->mode) && *nints < Nintregargs) {
-		dst = coreg(intargregs[*nints], argloc->mode);
+		break;
+	case PassInInt:
+		dst = coreg(intargregs[*nints], forceintmode(argloc->mode));
 		argloc = inri(s, argloc);
 		g(s, Imov, argloc, dst, NULL);
 		(*nints)++;
-	} else {
-		dst = locmem(*argoff, rsp, NULL, argloc->mode);
-		argloc = inri(s, argloc);
-		stor(s, argloc, dst);
-		*argoff += size(argn);
+		break;
+	case PassInNoPref: /* impossible */
+		die("cannot determine how to pass arg");
+		break;
+	}
+}
+
+static int
+sufficientregs(PassIn *pc, size_t sz, size_t nfloats, size_t nints)
+{
+	/* Can an aggregate of sz bytes, classified per eightbyte in pc, still fit in registers? */
+	size_t wantints = 0, wantfloats = 0;
+	size_t off;
+
+	if (sz > 16) {
+		return 0;
+	}
+	/* off starts at 1 so that only eightbytes containing at least one byte are counted */
+	for (off = 1; off <= sz; off += 8) {
+		switch (pc[off / 8]) {
+		case PassInInt: wantints++; break;
+		case PassInSSE: wantfloats++; break;
+		default: break;
+		}
 	}
+
+	return (nfloats + wantfloats <= Nfloatregargs) && (nints + wantints <= Nintregargs);
 }
 
 static Loc *
@@ -535,6 +611,7 @@ gencall(Isel *s, Node *n)
 {
 	Loc *arg;	/* values we reduced */
 	size_t argsz, argoff, nargs, vasplit;
+	size_t sz;
 	size_t nfloats, nints;
 	Loc *retloc, *rsp, *ret;	/* hard-coded registers */
 	Loc *stkbump;	/* calculated stack offset */
@@ -572,7 +649,12 @@ gencall(Isel *s, Node *n)
 	 * one at a time, we evaluate the args in reverse order.
 	 * Not good.
 	 *
-	 * Skip the first operand, since it's the function itself */
+	 * Skip the first operand, since it's the function itself
+	 *
+	 * Strictly speaking, we might waste a little space here,
+	 * since some of these args might actually get passed in
+	 * registers.
+	 */
 	for (i = 0; i < nargs; i++) {
 		argsz = align(argsz, min(size(args[i]), Ptrsz));
 		argsz += size(args[i]);
@@ -590,13 +672,37 @@ gencall(Isel *s, Node *n)
 	vararg = 0;
 	for (i = 0; i < nargs; i++) {
 		arg = selexpr(s, args[i]);
-		argoff = alignto(argoff, exprtype(args[i]));
+		t = exprtype(args[i]);
+		argoff = alignto(argoff, t);
+		sz = size(args[i]);
 		if (i >= vasplit)
 			vararg = 1;
 		else
 			argoff = align(argoff, 8);
-
-		placearg(s, args[i], arg, rsp, vararg, &nfloats, &nints, &argoff);
+		if (!vararg && isaggregate(t) && sz <= 16) {
+			PassIn pc[2] = { PassInNoPref, PassInNoPref };
+			classify(t, pc);
+			if (pc[0] == PassInMemory || pc[1] == PassInMemory || !sufficientregs(pc, sz, nfloats, nints)) {
+				placearg(s, args[i], arg, PassInMemory, rsp, vararg, &nfloats, &nints, &argoff);
+			} else {
+				assert(stacknode(args[i]));
+				assert(pc[0] != PassInNoPref);
+				placearg(s, args[i], arg, pc[0], rsp, vararg, &nfloats, &nints, &argoff);
+				if (size(args[i]) > 8) {
+					assert(pc[1] != PassInNoPref);
+					Loc *forcedreg = locreg(ModeQ);
+					if (arg->type == Loclbl || (arg->type == Locmeml && !arg->mem.base)) {
+						forcedreg = loclitl(arg->lbl);
+					} else {
+						g(s, Ilea, arg, forcedreg, NULL);
+					}
+					Loc *argpluseight = locmem(8, forcedreg, NULL, ModeQ);
+					placearg(s, args[i], argpluseight, pc[1], rsp, vararg, &nfloats, &nints, &argoff);
+				}
+			}
+		} else {
+			placearg(s, args[i], arg, PassInNoPref, rsp, vararg, &nfloats, &nints, &argoff);
+		}
 	}
 	call(s, n);
 	if (argsz)
@@ -976,26 +1082,55 @@ savedregs[] = {
 	Rnone
 };
 
+static Mode
+eightbytemode(PassIn p)
+{
+	/*
+	 * Mode used to move one whole eightbyte of a small aggregate:
+	 * ModeD when it travels in an SSE register, ModeQ otherwise.
+	 */
+	return (p == PassInSSE) ? ModeD : ModeQ;
+}
+
+
+static void
+movearg(Isel *s, Loc *dst, PassIn p, Mode m, size_t *nfloats, size_t *nints, size_t *argoff)
+{
+	/*
+	 * Copy an incoming argument (or one eightbyte of it) from the next
+	 * unused argument register of class p into dst, consuming that
+	 * register. Any other class emits nothing; argoff is not used.
+	 */
+	Loc *src;
+
+	assert(m != ModeNone);
+	if (p == PassInInt) {
+		src = coreg(intargregs[*nints], forceintmode(m));
+		g(s, Imov, src, dst, NULL);
+		*nints += 1;
+	} else if (p == PassInSSE) {
+		src = coreg(floatargregs[*nfloats], forcefltmode(m));
+		g(s, Imovs, src, dst, NULL);
+		*nfloats += 1;
+	}
+}
+
 static void
 retrievearg(Isel *s, Node *argn, int vararg, size_t *nfloats, size_t *nints, size_t *argoff)
 {
-	Loc *a, *l;
+	Loc *l;
 
 	if (stacknode(argn)) {
 		htput(s->stkoff, argn, itop(-(*argoff + 2*Ptrsz)));
 		*argoff += size(argn);
 	} else if (!vararg && isfloatmode(mode(argn)) && *nfloats < Nfloatregargs) {
-		a = coreg(floatargregs[*nfloats], mode(argn));
 		l = loc(s, argn);
-		g(s, Imovs, a, l, NULL);
+		movearg(s, l, PassInSSE, forcefltmode(mode(argn)), nfloats, nints, argoff);
 		htput(s->reglocs, argn, l);
-		(*nfloats)++;
 	} else if (!vararg && isintmode(mode(argn)) && *nints < Nintregargs) {
-		a = coreg(intargregs[*nints], mode(argn));
 		l = loc(s, argn);
-		g(s, Imov, a, l, NULL);
+		movearg(s, l, PassInInt, forceintmode(mode(argn)), nfloats, nints, argoff);
 		htput(s->reglocs, argn, l);
-		(*nints)++;
 	} else if (tybase(decltype(argn))->type != Tyvoid) {
 		/* varargs go on the stack */
 		htput(s->stkoff, argn, itop(-(*argoff + 2*Ptrsz)));
@@ -1008,8 +1143,10 @@ addarglocs(Isel *s, Func *fn)
 {
 	size_t i, nints, nfloats, nargs;
 	size_t argoff;
+	size_t sz;
 	int vararg;
 	Node *arg;
+	Type *t;
 
 	argoff = 0;
 	nfloats = 0;
@@ -1018,13 +1155,41 @@ addarglocs(Isel *s, Func *fn)
 	nargs = countargs(fn->type);
 	for (i = 0; i < fn->nargs; i++) {
 		arg = fn->args[i];
-		argoff = alignto(argoff, decltype(arg));
+		t = decltype(arg);
+		argoff = alignto(argoff, t);
+		sz = size(arg);
 		if (i >= nargs)
 			vararg = 1;
 		else
 			argoff = align(argoff, 8);
 
-		retrievearg(s, arg, vararg, &nfloats, &nints, &argoff);
+		if (!vararg && isaggregate(t) && sz <= 16) {
+			PassIn pc[2] = { PassInNoPref, PassInNoPref };
+			classify(t, pc);
+			if (pc[0] == PassInMemory || pc[1] == PassInMemory || !sufficientregs(pc, sz, nfloats, nints)) {
+				retrievearg(s, arg, vararg, &nfloats, &nints, &argoff);
+			} else {
+				/*
+				 * Needs to be carefully put back on the stack, not simply located among
+				 * registers. loc will succeed because the space was reserved in
+				 * handlesmallstructargs.
+				 */
+				Loc *l = loc(s, arg);
+				movearg(s, l, pc[0], eightbytemode(pc[0]), &nfloats, &nints, &argoff);
+				if (size(arg) > 8) {
+					Loc *forcedreg = locreg(ModeQ);
+					if (l->type == Loclbl || (l->type == Locmeml && !l->mem.base)) {
+						forcedreg = loclitl(l->lbl);
+					} else {
+						g(s, Ilea, l, forcedreg, NULL);
+					}
+					Loc *lpluseight = locmem(8, forcedreg, NULL, ModeQ);
+					movearg(s, lpluseight, pc[1], eightbytemode(pc[1]), &nfloats, &nints, &argoff);
+				}
+			}
+		} else {
+			retrievearg(s, arg, vararg, &nfloats, &nints, &argoff);
+		}
 	}
 }
 
@@ -1107,6 +1272,37 @@ mkasmbb(Bb *bb)
 	return as;
 }
 
+static void
+handlesmallstructargs(Isel *is, Func *fn)
+{
+	/*
+	 * Reserve stack slots for small aggregate arguments that may arrive in
+	 * registers, growing fn->stksz and recording each slot in fn->stkoff.
+	 * This runs from selfunc so that generics will be specialized.
+	 */
+	size_t nfixed = countargs(fn->type);
+	size_t idx;
+
+	for (idx = 0; idx < fn->nargs; idx++) {
+		Node *argn = fn->args[idx];
+		Type *ty = decltype(argn);
+		size_t slotsz = size(argn);
+		PassIn pc[2] = { PassInNoPref, PassInNoPref };
+
+		if (idx >= nfixed || !isaggregate(ty) || slotsz > 16) {
+			continue;
+		}
+		classify(ty, pc);
+		if (pc[0] == PassInMemory || pc[1] == PassInMemory) {
+			continue;
+		}
+		slotsz = align(slotsz, 8);
+		fn->stksz += slotsz;
+		fn->stksz = align(fn->stksz, min(slotsz, Ptrsz));
+		htput(fn->stkoff, argn, itop(fn->stksz));
+	}
+}
+
 void
 selfunc(Isel *is, Func *fn, Htab *globls, Htab *strtab)
 {
@@ -1132,6 +1328,7 @@ selfunc(Isel *is, Func *fn, Htab *globls, Htab *strtab)
 		g(is, Iloc, locstrlbl(buf), NULL);
 	}
 
+	handlesmallstructargs(is, fn);
 	prologue(is, fn, fn->stksz);
 	lastline = -1;
 	for (j = 0; j < fn->cfg->nbb - 1; j++) {
diff --git a/6/locs.c b/6/locs.c
index aa26f01e..b314cd9a 100644
--- a/6/locs.c
+++ b/6/locs.c
@@ -176,6 +176,8 @@ loclit(long val, Mode m)
 Loc *
 coreg(Reg r, Mode m)
 {
+	assert(m != ModeNone);
+
 	Reg crtab[][Nmode + 1] = {
 		[Ral]  = {Rnone, Ral,  Rax,  Reax, Rrax},
 		[Rcl]  = {Rnone, Rcl,  Rcx,  Recx, Rrcx},
-- 
2.26.2
| [PATCH 0/9] Handle small-aggregates via AMD64 abi | "S. Gilles" <sgilles@xxxxxxx> | 
- Prev by Date: [PATCH 4/9] Add classification algorithm for small-struct passing.
- Next by Date: [PATCH 7/9] Bump abi version.
- Previous by thread: [PATCH 4/9] Add classification algorithm for small-struct passing.
- Next by thread: [PATCH 7/9] Bump abi version.
- Index(es):