Cranelift AArch64: Implement scalar FmaxPseudo and FminPseudo

2021-09-14 22:05:26 +01:00
parent 144a0bfd83
commit 930b1f17f0
2 changed files with 44 additions and 24 deletions
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2926,42 +2926,62 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
        }
        Opcode::FminPseudo | Opcode::FmaxPseudo => {
-            let ty = ctx.input_ty(insn, 0);
+            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            if ty == F32X4 || ty == F64X2 {
+            let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
            let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
            let (ra, rb) = if op == Opcode::FminPseudo {
                (rm, rn)
            } else {
                (rn, rm)
            };
            let ty = ty.unwrap();
            let lane_type = ty.lane_type();
            debug_assert!(lane_type == F32 || lane_type == F64);
            if ty.is_vector() {
                let size = VectorSize::from_ty(ty);
                // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
                // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
-                let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+                // Since we're going to write the output register `rd` anyway, we might as well
-                let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
+                // first use it to hold the comparison result.  This has the slightly unusual
                let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
                // Since we're going to write the output register `rd` anyway, we might as
                // well first use it to hold the comparison result.  This has the slightly unusual
                // effect that we modify the output register in the first instruction (`fcmgt`)
                // but read both the inputs again in the second instruction (`bsl`), which means
                // that the output register can't be either of the input registers.  Regalloc
                // should handle this correctly, nevertheless.
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Fcmgt,
-                    rd: r_dst,
+                    rd,
-                    rn: if op == Opcode::FminPseudo { r_a } else { r_b },
+                    rn: ra,
-                    rm: if op == Opcode::FminPseudo { r_b } else { r_a },
+                    rm: rb,
-                    size: if ty == F32X4 {
+                    size,
                        VectorSize::Size32x4
                    } else {
                        VectorSize::Size64x2
                    },
                });
                ctx.emit(Inst::VecRRR {
                    alu_op: VecALUOp::Bsl,
-                    rd: r_dst,
+                    rd,
-                    rn: r_b,
+                    rn,
-                    rm: r_a,
+                    rm,
-                    size: VectorSize::Size8x16,
+                    size,
                });
            } else {
-                return Err(CodegenError::Unsupported(format!(
+                if lane_type == F32 {
-                    "{}: Unsupported type: {:?}",
+                    ctx.emit(Inst::FpuCmp32 { rn: ra, rm: rb });
-                    op, ty
+                    ctx.emit(Inst::FpuCSel32 {
-                )));
+                        rd,
                        rn,
                        rm,
                        cond: Cond::Gt,
                    });
                } else {
                    ctx.emit(Inst::FpuCmp64 { rn: ra, rm: rb });
                    ctx.emit(Inst::FpuCSel64 {
                        rd,
                        rn,
                        rm,
                        cond: Cond::Gt,
                    });
                }
            }
        }
--- a/cranelift/filetests/filetests/runtests/fmin-max-pseudo.clif
+++ b/cranelift/filetests/filetests/runtests/fmin-max-pseudo.clif
@@ -1,6 +1,6 @@
 test run
 ; target s390x TODO: Not yet implemented on s390x
-; target aarch64 TODO: Not yet implemented on aarch64
+target aarch64
 set enable_simd
 target x86_64 machinst skylake