Refactor AArch64 ABI support to extract common bits for shared impl with x64.

We have observed that the ABI implementations for AArch64 and x64 are very similar; in fact, x64's implementation started as a modified copy of AArch64's implementation. This is an artifact of two things: the ABIs themselves are similar (both machines pass args and return values in registers first, then on the stack, and both machines give considerable freedom with stack-frame layout), and the ABI abstraction in the existing design is too low-level. For machines that fit the mainstream (i.e., most common) ABI-design idioms, we should be able to do much better. This commit factors the AArch64 ABI implementation into machine-specific and machine-independent parts, but does not yet modify x64; that will come next. This change should be completely neutral with respect to compile time and generated-code performance.
2020-08-12 20:31:35 -07:00
parent 38ef98700f
commit 5cf3fba3da
10 changed files with 2039 additions and 1693 deletions
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -1010,7 +1010,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                | Opcode::Sload32Complex => true,
                _ => false,
            };
-            let is_float = ty_is_float(elem_ty);
+            let is_float = ty_has_float_or_vec_representation(elem_ty);

            let mem = lower_address(ctx, elem_ty, &inputs[..], off);
            let rd = get_output_reg(ctx, outputs[0]);
@@ -1074,7 +1074,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0),
                _ => unreachable!(),
            };
-            let is_float = ty_is_float(elem_ty);
+            let is_float = ty_has_float_or_vec_representation(elem_ty);

            let mem = lower_address(ctx, elem_ty, &inputs[1..], off);
            let rd = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
@@ -1291,9 +1291,10 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty);
-            if ty_is_float(ty) && bits == 32 {
+            let is_float = ty_has_float_or_vec_representation(ty);
+            if is_float && bits == 32 {
                ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
-            } else if ty_is_float(ty) && bits == 64 {
+            } else if is_float && bits == 64 {
                ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
            } else {
                ctx.emit(Inst::CSel { cond, rd, rn, rm });
@@ -1315,9 +1316,10 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            let rm = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
            let ty = ctx.output_ty(insn, 0);
            let bits = ty_bits(ty);
-            if ty_is_float(ty) && bits == 32 {
+            let is_float = ty_has_float_or_vec_representation(ty);
+            if is_float && bits == 32 {
                ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm });
-            } else if ty_is_float(ty) && bits == 64 {
+            } else if is_float && bits == 64 {
                ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm });
            } else {
                ctx.emit(Inst::CSel { cond, rd, rn, rm });
@@ -1521,7 +1523,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            let rd = get_output_reg(ctx, outputs[0]);
            let ity = ctx.input_ty(insn, 0);
            let oty = ctx.output_ty(insn, 0);
-            match (ty_is_float(ity), ty_is_float(oty)) {
+            let ity_vec_reg = ty_has_float_or_vec_representation(ity);
+            let oty_vec_reg = ty_has_float_or_vec_representation(oty);
+            match (ity_vec_reg, oty_vec_reg) {
                (true, true) => {
                    let narrow_mode = if ty_bits(ity) <= 32 && ty_bits(oty) <= 32 {
                        NarrowValueMode::ZeroExtend32
@@ -1809,7 +1813,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                let size = VectorSize::from_ty(ctx.input_ty(insn, 0));
                let ty = ty.unwrap();

-                if ty_is_int(ty) {
+                if ty_has_int_representation(ty) {
                    ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
                // Plain moves are faster on some processors.
                } else if idx == 0 {
@@ -1837,7 +1841,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(

            ctx.emit(Inst::gen_move(rd, rm, ty));

-            if ty_is_int(input_ty) {
+            if ty_has_int_representation(input_ty) {
                ctx.emit(Inst::MovToVec { rd, rn, idx, size });
            } else {
                ctx.emit(Inst::VecMovElement {
@@ -1855,7 +1859,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            let rd = get_output_reg(ctx, outputs[0]);
            let input_ty = ctx.input_ty(insn, 0);
            let size = VectorSize::from_ty(ty.unwrap());
-            let inst = if ty_is_int(input_ty) {
+            let inst = if ty_has_int_representation(input_ty) {
                Inst::VecDup { rd, rn, size }
            } else {
                Inst::VecDupFromFpu { rd, rn, size }