Cranelift AArch64: Implement scalar FmaxPseudo and FminPseudo

Copyright (c) 2021, Arm Limited.
This commit is contained in:
Anton Kirilov
2021-09-14 22:05:26 +01:00
parent 144a0bfd83
commit 930b1f17f0
2 changed files with 44 additions and 24 deletions

View File

@@ -2926,42 +2926,62 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} }
Opcode::FminPseudo | Opcode::FmaxPseudo => { Opcode::FminPseudo | Opcode::FmaxPseudo => {
let ty = ctx.input_ty(insn, 0); let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
if ty == F32X4 || ty == F64X2 { let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
let (ra, rb) = if op == Opcode::FminPseudo {
(rm, rn)
} else {
(rn, rm)
};
let ty = ty.unwrap();
let lane_type = ty.lane_type();
debug_assert!(lane_type == F32 || lane_type == F64);
if ty.is_vector() {
let size = VectorSize::from_ty(ty);
// pmin(a,b) => bitsel(b, a, cmpgt(a, b)) // pmin(a,b) => bitsel(b, a, cmpgt(a, b))
// pmax(a,b) => bitsel(b, a, cmpgt(b, a)) // pmax(a,b) => bitsel(b, a, cmpgt(b, a))
let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); // Since we're going to write the output register `rd` anyway, we might as well
let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); // first use it to hold the comparison result. This has the slightly unusual
let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
// Since we're going to write the output register `r_dst` anyway, we might as
// well first use it to hold the comparison result. This has the slightly unusual
// effect that we modify the output register in the first instruction (`fcmgt`) // effect that we modify the output register in the first instruction (`fcmgt`)
// but read both the inputs again in the second instruction (`bsl`), which means // but read both the inputs again in the second instruction (`bsl`), which means
// that the output register can't be either of the input registers. Regalloc // that the output register can't be either of the input registers. Regalloc
// should handle this correctly, nevertheless. // should handle this correctly, nevertheless.
ctx.emit(Inst::VecRRR { ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Fcmgt, alu_op: VecALUOp::Fcmgt,
rd: r_dst, rd,
rn: if op == Opcode::FminPseudo { r_a } else { r_b }, rn: ra,
rm: if op == Opcode::FminPseudo { r_b } else { r_a }, rm: rb,
size: if ty == F32X4 { size,
VectorSize::Size32x4
} else {
VectorSize::Size64x2
},
}); });
ctx.emit(Inst::VecRRR { ctx.emit(Inst::VecRRR {
alu_op: VecALUOp::Bsl, alu_op: VecALUOp::Bsl,
rd: r_dst, rd,
rn: r_b, rn,
rm: r_a, rm,
size: VectorSize::Size8x16, size,
}); });
} else { } else {
return Err(CodegenError::Unsupported(format!( if lane_type == F32 {
"{}: Unsupported type: {:?}", ctx.emit(Inst::FpuCmp32 { rn: ra, rm: rb });
op, ty ctx.emit(Inst::FpuCSel32 {
))); rd,
rn,
rm,
cond: Cond::Gt,
});
} else {
ctx.emit(Inst::FpuCmp64 { rn: ra, rm: rb });
ctx.emit(Inst::FpuCSel64 {
rd,
rn,
rm,
cond: Cond::Gt,
});
}
} }
} }

View File

@@ -1,6 +1,6 @@
test run test run
; target s390x TODO: Not yet implemented on s390x ; target s390x TODO: Not yet implemented on s390x
; target aarch64 TODO: Not yet implemented on aarch64 target aarch64
set enable_simd set enable_simd
target x86_64 machinst skylake target x86_64 machinst skylake