From 03b9e1e86a26a788faa4fca627971d20d2a3c353 Mon Sep 17 00:00:00 2001 From: Benjamin Bouvier Date: Tue, 21 Jul 2020 14:27:44 +0200 Subject: [PATCH] machinst x64: implement float min/max with the right semantics; --- cranelift/codegen/src/isa/x64/inst/args.rs | 6 ++ cranelift/codegen/src/isa/x64/inst/emit.rs | 88 ++++++++++++++++++++++ cranelift/codegen/src/isa/x64/inst/mod.rs | 59 +++++++++++++++ cranelift/codegen/src/isa/x64/lower.rs | 15 ++++ 4 files changed, 168 insertions(+) diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index 2387af0ec6..713b783f30 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -337,6 +337,7 @@ pub enum SseOpcode { Addss, Addsd, Andps, + Andpd, Andnps, Comiss, Comisd, @@ -365,6 +366,7 @@ pub enum SseOpcode { Mulss, Mulsd, Orps, + Orpd, Rcpss, Roundss, Roundsd, @@ -404,6 +406,7 @@ impl SseOpcode { | SseOpcode::Cmpss => SSE, SseOpcode::Addsd + | SseOpcode::Andpd | SseOpcode::Cvtsd2ss | SseOpcode::Cvtsd2si | SseOpcode::Cvtsi2sd @@ -416,6 +419,7 @@ impl SseOpcode { | SseOpcode::Movq | SseOpcode::Movsd | SseOpcode::Mulsd + | SseOpcode::Orpd | SseOpcode::Sqrtsd | SseOpcode::Subsd | SseOpcode::Ucomisd @@ -440,6 +444,7 @@ impl fmt::Debug for SseOpcode { let name = match self { SseOpcode::Addss => "addss", SseOpcode::Addsd => "addsd", + SseOpcode::Andpd => "andpd", SseOpcode::Andps => "andps", SseOpcode::Andnps => "andnps", SseOpcode::Comiss => "comiss", @@ -465,6 +470,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Movsd => "movsd", SseOpcode::Mulss => "mulss", SseOpcode::Mulsd => "mulsd", + SseOpcode::Orpd => "orpd", SseOpcode::Orps => "orps", SseOpcode::Rcpss => "rcpss", SseOpcode::Roundss => "roundss", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 1fe72b4fff..c91e9f81c1 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1530,10 +1530,12 @@ pub(crate) fn emit( let (prefix, opcode) = match op { SseOpcode::Addss => (LegacyPrefix::_F3, 0x0F58), SseOpcode::Addsd => (LegacyPrefix::_F2, 0x0F58), + SseOpcode::Andpd => (LegacyPrefix::_66, 0x0F54), SseOpcode::Andps => (LegacyPrefix::None, 0x0F54), SseOpcode::Andnps => (LegacyPrefix::None, 0x0F55), SseOpcode::Mulss => (LegacyPrefix::_F3, 0x0F59), SseOpcode::Mulsd => (LegacyPrefix::_F2, 0x0F59), + SseOpcode::Orpd => (LegacyPrefix::_66, 0x0F56), SseOpcode::Orps => (LegacyPrefix::None, 0x0F56), SseOpcode::Subss => (LegacyPrefix::_F3, 0x0F5C), SseOpcode::Subsd => (LegacyPrefix::_F2, 0x0F5C), @@ -1557,6 +1559,92 @@ pub(crate) fn emit( } } + Inst::XmmMinMaxSeq { + size, + is_min, + lhs, + rhs_dst, + } => { + // Generates the following sequence: + // cmpss/cmpsd %lhs, %rhs_dst + // jnz do_min_max + // jp propagate_nan + // + // ;; ordered and equal: propagate the sign bit (for -0 vs 0): + // {and,or}{ss,sd} %lhs, %rhs_dst + // j done + // + // ;; to get the desired NaN behavior (signalling NaN transformed into a quiet NaN, the + // NaN value is returned), we add both inputs. + // propagate_nan: + // add{ss,sd} %lhs, %rhs_dst + // j done + // + // do_min_max: + // min{ss,sd} %lhs, %rhs_dst + // + // done: + let done = sink.get_label(); + let propagate_nan = sink.get_label(); + let do_min_max = sink.get_label(); + + let (add_op, cmp_op, and_op, or_op, min_max_op) = match size { + OperandSize::Size32 => ( + SseOpcode::Addss, + SseOpcode::Ucomiss, + SseOpcode::Andps, + SseOpcode::Orps, + if *is_min { + SseOpcode::Minss + } else { + SseOpcode::Maxss + }, + ), + OperandSize::Size64 => ( + SseOpcode::Addsd, + SseOpcode::Ucomisd, + SseOpcode::Andpd, + SseOpcode::Orpd, + if *is_min { + SseOpcode::Minsd + } else { + SseOpcode::Maxsd + }, + ), + }; + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(*lhs), rhs_dst.to_reg()); + inst.emit(sink, flags, state); + + one_way_jmp(sink, CC::NZ, do_min_max); + one_way_jmp(sink, CC::P, propagate_nan); + + // Ordered and equal. The operands are bit-identical unless they are zero + // and negative zero. These instructions merge the sign bits in that + // case, and are no-ops otherwise. + let op = if *is_min { or_op } else { and_op }; + let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst); + inst.emit(sink, flags, state); + + let inst = Inst::jmp_known(BranchTarget::Label(done)); + inst.emit(sink, flags, state); + + // x86's min/max are not symmetric; if either operand is a NaN, they return the + // read-only operand: perform an addition between the two operands, which has the + // desired NaN propagation effects. + sink.bind_label(propagate_nan); + let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst); + inst.emit(sink, flags, state); + + one_way_jmp(sink, CC::P, done); + + sink.bind_label(do_min_max); + let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst); + inst.emit(sink, flags, state); + + sink.bind_label(done); + } + Inst::Xmm_Mov_R_M { op, src, diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index fc04a58fc1..ffae1d1443 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -278,6 +278,14 @@ pub enum Inst { srcloc: SourceLoc, }, + /// A sequence to compute min/max with the proper NaN semantics for xmm registers. + XmmMinMaxSeq { + size: OperandSize, + is_min: bool, + lhs: Reg, + rhs_dst: Writable, + }, + /// XMM (scalar) conditional move. /// Overwrites the destination register if cc is set. XmmCmove { @@ -629,6 +637,22 @@ impl Inst { } } + pub(crate) fn xmm_min_max_seq( + size: OperandSize, + is_min: bool, + lhs: Reg, + rhs_dst: Writable, + ) -> Inst { + debug_assert_eq!(lhs.get_class(), RegClass::V128); + debug_assert_eq!(rhs_dst.to_reg().get_class(), RegClass::V128); + Inst::XmmMinMaxSeq { + size, + is_min, + lhs, + rhs_dst, + } + } + pub(crate) fn movzx_rm_r( ext_mode: ExtMode, src: RegMem, @@ -980,6 +1004,29 @@ impl ShowWithRRU for Inst { show_ireg_sized(dst.to_reg(), mb_rru, 8), ), + Inst::XmmMinMaxSeq { + lhs, + rhs_dst, + is_min, + size, + } => format!( + "{} {}, {}", + ljustify2( + if *is_min { + "xmm min seq ".to_string() + } else { + "xmm max seq ".to_string() + }, + match size { + OperandSize::Size32 => "f32", + OperandSize::Size64 => "f64", + } + .into() + ), + show_ireg_sized(*lhs, mb_rru, 8), + show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8), + ), + Inst::XmmToGpr { op, src, @@ -1333,6 +1380,10 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { src.get_regs_as_uses(collector); collector.add_mod(*dst); } + Inst::XmmMinMaxSeq { lhs, rhs_dst, .. } => { + collector.add_use(*lhs); + collector.add_mod(*rhs_dst); + } Inst::Xmm_Mov_R_M { src, dst, .. } => { collector.add_use(*src); dst.get_regs_as_uses(collector); @@ -1579,6 +1630,14 @@ fn x64_map_regs(inst: &mut Inst, mapper: &RUM) { src.map_uses(mapper); map_mod(mapper, dst); } + Inst::XmmMinMaxSeq { + ref mut lhs, + ref mut rhs_dst, + .. + } => { + map_use(mapper, lhs); + map_mod(mapper, rhs_dst); + } Inst::Xmm_Mov_R_M { ref mut src, ref mut dst, diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index acbd073986..1cd8ed224b 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -924,6 +924,21 @@ fn lower_insn_to_regs>( ctx.emit(Inst::xmm_rm_r(sse_op, RegMem::reg(rhs), dst)); } + Opcode::Fmin | Opcode::Fmax => { + let lhs = input_to_reg(ctx, inputs[0]); + let rhs = input_to_reg(ctx, inputs[1]); + let dst = output_to_reg(ctx, outputs[0]); + let is_min = op == Opcode::Fmin; + let output_ty = ty.unwrap(); + ctx.emit(Inst::gen_move(dst, rhs, output_ty)); + let op_size = match output_ty { + F32 => OperandSize::Size32, + F64 => OperandSize::Size64, + _ => panic!("unexpected type {:?} for fmin/fmax", output_ty), + }; + ctx.emit(Inst::xmm_min_max_seq(op_size, is_min, lhs, dst)); + } + Opcode::Sqrt => { let src = input_to_reg_mem(ctx, inputs[0]); let dst = output_to_reg(ctx, outputs[0]);