Convert fma, valltrue & vanytrue to ISLE (AArch64) (#4608)

* Convert `fma`, `valltrue` & `vanytrue` to ISLE (AArch64)

Ported the existing implementations of the following opcodes to ISLE on
AArch64:
- `fma`
  - Introduced previously missing support for `fma` on vector values, as
    described in the instruction's documentation (see the sketches below).
- `valltrue`
- `vanytrue`
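
For reference, these opcodes are exercised by CLIF functions of roughly
the following shape (a minimal sketch, assuming the boolean result type
CLIF used at the time; not taken from the test suite):

    function %vall_true_i64x2(i64x2) -> b1 {
    block0(v0: i64x2):
        v1 = vall_true v0
        return v1
    }

    function %vany_true_i16x8(i16x8) -> b1 {
    block0(v0: i16x8):
        v1 = vany_true v0
        return v1
    }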

Also fixed `fcmp` on scalar values in the interpreter, and enabled
interpreter tests in `simd-fma.clif`.
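
A run test of the kind now executed by the interpreter in `simd-fma.clif`
looks roughly like this (a sketch with illustrative values, not an excerpt
from the actual file):

    test interpret
    test run
    target aarch64

    function %fma_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
    block0(v0: f32x4, v1: f32x4, v2: f32x4):
        v3 = fma v0, v1, v2
        return v3
    }
    ; fma(2.0, 3.0, 1.0) == 7.0 in every lane
    ; run: %fma_f32x4([0x1.0p1 0x1.0p1 0x1.0p1 0x1.0p1], [0x1.8p1 0x1.8p1 0x1.8p1 0x1.8p1], [0x1.0p0 0x1.0p0 0x1.0p0 0x1.0p0]) == [0x1.cp2 0x1.cp2 0x1.cp2 0x1.cp2]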

This introduces the `FMLA` machine instruction, a vector fused
multiply-add: each lane of the destination register is updated with the
product of the corresponding lanes of the two source registers, added to
the destination lane's existing value without intermediate rounding.

Copyright (c) 2022 Arm Limited

* Add comments for `Fmla` and `Bsl`

Copyright (c) 2022 Arm Limited
Author: Damian Heaton
Date: 2022-08-05 17:47:56 +01:00
Committed by: GitHub
Parent: 1ed7b43e62
Commit: eb332b8369
19 changed files with 608 additions and 206 deletions

@@ -457,7 +457,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
(true, false) => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-let size = VectorSize::from_lane_size(ScalarSize::from_bits(oty_bits), true);
+let size = ScalarSize::from_bits(oty_bits);
ctx.emit(Inst::MovFromVec {
rd,
@@ -685,7 +685,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let ty = ty.unwrap();
if ty_has_int_representation(ty) {
-ctx.emit(Inst::MovFromVec { rd, rn, idx, size });
+ctx.emit(Inst::MovFromVec {
+rd,
+rn,
+idx,
+size: size.lane_size(),
+});
// Plain moves are faster on some processors.
} else if idx == 0 {
ctx.emit(Inst::gen_move(rd, rn, ty));
@@ -729,115 +734,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::ScalarToVector => implemented_in_isle(ctx),
-Opcode::VallTrue if ctx.input_ty(insn, 0).lane_bits() == 64 => {
-let input_ty = ctx.input_ty(insn, 0);
-if input_ty.lane_count() != 2 {
-return Err(CodegenError::Unsupported(format!(
-"VallTrue: unsupported type {:?}",
-input_ty
-)));
-}
-let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-let tmp = ctx.alloc_tmp(I64X2).only_reg().unwrap();
-// cmeq vtmp.2d, vm.2d, #0
-// addp dtmp, vtmp.2d
-// fcmp dtmp, dtmp
-// cset xd, eq
-//
-// Note that after the ADDP the value of the temporary register will
-// be either 0 when all input elements are true, i.e. non-zero, or a
-// NaN otherwise (either -1 or -2 when represented as an integer);
-// NaNs are the only floating-point numbers that compare unequal to
-// themselves.
-ctx.emit(Inst::VecMisc {
-op: VecMisc2::Cmeq0,
-rd: tmp,
-rn: rm,
-size: VectorSize::Size64x2,
-});
-ctx.emit(Inst::VecRRPair {
-op: VecPairOp::Addp,
-rd: tmp,
-rn: tmp.to_reg(),
-});
-ctx.emit(Inst::FpuCmp {
-size: ScalarSize::Size64,
-rn: tmp.to_reg(),
-rm: tmp.to_reg(),
-});
-materialize_bool_result(ctx, insn, rd, Cond::Eq);
-}
-Opcode::VanyTrue | Opcode::VallTrue => {
-let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-let src_ty = ctx.input_ty(insn, 0);
-let tmp = ctx.alloc_tmp(src_ty).only_reg().unwrap();
-// This operation is implemented by using umaxp or uminv to
-// create a scalar value, which is then compared against zero.
-//
-// umaxp vn.16b, vm.16b, vm.16b / uminv bn, vm.16b
-// mov xm, vn.d[0]
-// cmp xm, #0
-// cset xm, ne
-let s = VectorSize::from_ty(src_ty);
-let size = if s == VectorSize::Size64x2 {
-// `vall_true` with 64-bit elements is handled elsewhere.
-debug_assert_ne!(op, Opcode::VallTrue);
-VectorSize::Size32x4
-} else {
-s
-};
-if op == Opcode::VanyTrue {
-ctx.emit(Inst::VecRRR {
-alu_op: VecALUOp::Umaxp,
-rd: tmp,
-rn: rm,
-rm,
-size,
-});
-} else {
-if size == VectorSize::Size32x2 {
-return Err(CodegenError::Unsupported(format!(
-"VallTrue: Unsupported type: {:?}",
-src_ty
-)));
-}
-ctx.emit(Inst::VecLanes {
-op: VecLanesOp::Uminv,
-rd: tmp,
-rn: rm,
-size,
-});
-};
-ctx.emit(Inst::MovFromVec {
-rd,
-rn: tmp.to_reg(),
-idx: 0,
-size: VectorSize::Size64x2,
-});
-ctx.emit(Inst::AluRRImm12 {
-alu_op: ALUOp::SubS,
-size: OperandSize::Size64,
-rd: writable_zero_reg(),
-rn: rd.to_reg(),
-imm12: Imm12::zero(),
-});
-materialize_bool_result(ctx, insn, rd, Cond::Ne);
-}
+Opcode::VallTrue | Opcode::VanyTrue => implemented_in_isle(ctx),
Opcode::VhighBits => {
let dst_r = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
@@ -904,7 +801,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
-size: VectorSize::Size16x8,
+size: ScalarSize::Size16,
});
}
I16X8 => {
@@ -962,7 +859,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
-size: VectorSize::Size16x8,
+size: ScalarSize::Size16,
});
}
I32X4 => {
@@ -1018,7 +915,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
rd: dst_r,
rn: tmp_v0.to_reg(),
idx: 0,
-size: VectorSize::Size32x4,
+size: ScalarSize::Size32,
});
}
I64X2 => {
@@ -1031,13 +928,13 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
rd: dst_r,
rn: src_v,
idx: 0,
-size: VectorSize::Size64x2,
+size: ScalarSize::Size64,
});
ctx.emit(Inst::MovFromVec {
rd: tmp_r0,
rn: src_v,
idx: 1,
-size: VectorSize::Size64x2,
+size: ScalarSize::Size64,
});
ctx.emit(Inst::AluRRImmShift {
alu_op: ALUOp::Lsr,
@@ -1139,31 +1036,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => implemented_in_isle(ctx),
-Opcode::Fma => {
-let ty = ty.unwrap();
-let bits = ty_bits(ty);
-let fpu_op = match bits {
-32 => FPUOp3::MAdd32,
-64 => FPUOp3::MAdd64,
-_ => {
-return Err(CodegenError::Unsupported(format!(
-"Fma: Unsupported type: {:?}",
-ty
-)))
-}
-};
-let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-let ra = put_input_in_reg(ctx, inputs[2], NarrowValueMode::None);
-let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-ctx.emit(Inst::FpuRRRR {
-fpu_op,
-rn,
-rm,
-ra,
-rd,
-});
-}
+Opcode::Fma => implemented_in_isle(ctx),
Opcode::Fcopysign => {
// Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence: