Cranelift AArch64: Implement scalar FmaxPseudo and FminPseudo
Copyright (c) 2021, Arm Limited.
This commit is contained in:
@@ -2926,42 +2926,62 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
|
||||
}
|
||||
|
||||
Opcode::FminPseudo | Opcode::FmaxPseudo => {
|
||||
let ty = ctx.input_ty(insn, 0);
|
||||
if ty == F32X4 || ty == F64X2 {
|
||||
let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||
let rm = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
let rn = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
|
||||
let (ra, rb) = if op == Opcode::FminPseudo {
|
||||
(rm, rn)
|
||||
} else {
|
||||
(rn, rm)
|
||||
};
|
||||
let ty = ty.unwrap();
|
||||
let lane_type = ty.lane_type();
|
||||
|
||||
debug_assert!(lane_type == F32 || lane_type == F64);
|
||||
|
||||
if ty.is_vector() {
|
||||
let size = VectorSize::from_ty(ty);
|
||||
|
||||
// pmin(a,b) => bitsel(b, a, cmpgt(a, b))
|
||||
// pmax(a,b) => bitsel(b, a, cmpgt(b, a))
|
||||
let r_dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
|
||||
let r_a = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
|
||||
let r_b = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
|
||||
// Since we're going to write the output register `r_dst` anyway, we might as
|
||||
// well first use it to hold the comparison result. This has the slightly unusual
|
||||
// Since we're going to write the output register `rd` anyway, we might as well
|
||||
// first use it to hold the comparison result. This has the slightly unusual
|
||||
// effect that we modify the output register in the first instruction (`fcmgt`)
|
||||
// but read both the inputs again in the second instruction (`bsl`), which means
|
||||
// that the output register can't be either of the input registers. Regalloc
|
||||
// should handle this correctly, nevertheless.
|
||||
ctx.emit(Inst::VecRRR {
|
||||
alu_op: VecALUOp::Fcmgt,
|
||||
rd: r_dst,
|
||||
rn: if op == Opcode::FminPseudo { r_a } else { r_b },
|
||||
rm: if op == Opcode::FminPseudo { r_b } else { r_a },
|
||||
size: if ty == F32X4 {
|
||||
VectorSize::Size32x4
|
||||
} else {
|
||||
VectorSize::Size64x2
|
||||
},
|
||||
rd,
|
||||
rn: ra,
|
||||
rm: rb,
|
||||
size,
|
||||
});
|
||||
ctx.emit(Inst::VecRRR {
|
||||
alu_op: VecALUOp::Bsl,
|
||||
rd: r_dst,
|
||||
rn: r_b,
|
||||
rm: r_a,
|
||||
size: VectorSize::Size8x16,
|
||||
rd,
|
||||
rn,
|
||||
rm,
|
||||
size,
|
||||
});
|
||||
} else {
|
||||
return Err(CodegenError::Unsupported(format!(
|
||||
"{}: Unsupported type: {:?}",
|
||||
op, ty
|
||||
)));
|
||||
if lane_type == F32 {
|
||||
ctx.emit(Inst::FpuCmp32 { rn: ra, rm: rb });
|
||||
ctx.emit(Inst::FpuCSel32 {
|
||||
rd,
|
||||
rn,
|
||||
rm,
|
||||
cond: Cond::Gt,
|
||||
});
|
||||
} else {
|
||||
ctx.emit(Inst::FpuCmp64 { rn: ra, rm: rb });
|
||||
ctx.emit(Inst::FpuCSel64 {
|
||||
rd,
|
||||
rn,
|
||||
rm,
|
||||
cond: Cond::Gt,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
test run
|
||||
; target s390x TODO: Not yet implemented on s390x
|
||||
; target aarch64 TODO: Not yet implemented on aarch64
|
||||
target aarch64
|
||||
set enable_simd
|
||||
target x86_64 machinst skylake
|
||||
|
||||
|
||||
Reference in New Issue
Block a user