From e43310a088aceaa46fbdc810e15a85f6dee10806 Mon Sep 17 00:00:00 2001 From: Benjamin Bouvier Date: Wed, 22 Jul 2020 16:46:12 +0200 Subject: [PATCH] machinst x64: adapt conversions for saturation behaviors; --- cranelift/codegen/src/isa/x64/inst/emit.rs | 295 ++++++++++++++------- cranelift/codegen/src/isa/x64/inst/mod.rs | 31 +-- cranelift/codegen/src/isa/x64/lower.rs | 16 +- 3 files changed, 223 insertions(+), 119 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 09b0957622..1fe72b4fff 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1672,7 +1672,7 @@ pub(crate) fn emit( // mov %src, %tmp_gpr2 // and $1, %tmp_gpr2 // or %tmp_gpr1, %tmp_gpr2 - // ctsi2sd/cvtsi2ss %tmp_gpr2, %dst + // cvtsi2sd/cvtsi2ss %tmp_gpr2, %dst // addsd/addss %dst, %dst // // done: @@ -1769,25 +1769,45 @@ pub(crate) fn emit( Inst::CvtFloatToSintSeq { src_size, dst_size, + is_saturating, src, dst, tmp_gpr, tmp_xmm, srcloc, } => { - // Emits the following sequence: + // Emits the following common sequence: // // cvttss2si/cvttsd2si %src, %dst - // cmp $INT_MIN, %dst ;; 2 instructions (movaps + reg cmp) for 64-bits ints - // jnz done + // cmp %dst, 1 + // jno done + // + // Then, for saturating conversions: // // ;; check for NaN // cmpss/cmpsd %src, %src - // jnp check_if_correct + // jnp not_nan + // xor %dst, %dst + // + // ;; positive inputs get saturated to INT_MAX; negative ones to INT_MIN, which is + // ;; already in %dst. + // mov 0, %tmp_gpr + // movd/movq %tmp_gpr, %tmp_xmm + // cmpss/cmpsd %src, %tmp_xmm + // jnb done + // mov/movaps $INT_MAX, %dst + // + // done: + // + // Then, for non-saturating conversions: + // + // ;; check for NaN + // cmpss/cmpsd %src, %src + // jnp not_nan // ud2 trap BadConversionToInteger // // ;; check if INT_MIN was the correct result, against a magic constant: - // check_if_correct: + // not_nan: // movaps/mov $magic, %tmp_gpr // movq/movd %tmp_gpr, %tmp_xmm // cmpss/cmpsd %tmp_xmm, %src @@ -1812,112 +1832,147 @@ pub(crate) fn emit( }; let done = sink.get_label(); + let not_nan = sink.get_label(); + // The truncation. let inst = Inst::xmm_to_gpr(trunc_op, src, *dst, *dst_size); inst.emit(sink, flags, state); - // Generate constant INT_MIN, and compare against it. - if *dst_size == OperandSize::Size64 { - let inst = Inst::imm_r(true, 0x8000000000000000, *tmp_gpr); - inst.emit(sink, flags, state); + // Compare against 1, in case of overflow the dst operand was INT_MIN. + let inst = Inst::cmp_rmi_r(dst_size.to_bytes(), RegMemImm::imm(1), dst.to_reg()); + inst.emit(sink, flags, state); - let inst = Inst::cmp_rmi_r(8, RegMemImm::reg(tmp_gpr.to_reg()), dst.to_reg()); - inst.emit(sink, flags, state); - } else { - // Emit a simple comparison. - let inst = Inst::cmp_rmi_r(4, RegMemImm::imm(0x80000000), dst.to_reg()); - inst.emit(sink, flags, state); - } - - one_way_jmp(sink, CC::NZ, done); // == (int) + one_way_jmp(sink, CC::NO, done); // no overflow => done // Check for NaN. let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), src); inst.emit(sink, flags, state); - let check_if_correct = sink.get_label(); - one_way_jmp(sink, CC::NP, check_if_correct); // jump over trap if not a NaN + one_way_jmp(sink, CC::NP, not_nan); // go to not_nan if not a NaN - let inst = Inst::trap(*srcloc, TrapCode::BadConversionToInteger); - inst.emit(sink, flags, state); + if *is_saturating { + // For NaN, emit 0. + let inst = Inst::alu_rmi_r( + *dst_size == OperandSize::Size64, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.to_reg()), + *dst, + ); + inst.emit(sink, flags, state); - // Check if INT_MIN was the correct result: determine the smallest floating point - // number that would convert to INT_MIN, put it in a temporary register, and compare - // against the src register. - // If the src register is less (or in some cases, less-or-equal) than the threshold, - // trap! + let inst = Inst::jmp_known(BranchTarget::Label(done)); + inst.emit(sink, flags, state); - sink.bind_label(check_if_correct); + sink.bind_label(not_nan); - let mut no_overflow_cc = CC::NB; // >= - let output_bits = dst_size.to_bits(); - match *src_size { - OperandSize::Size32 => { - let cst = Ieee32::pow2(output_bits - 1).neg().bits(); - let inst = Inst::imm32_r_unchecked(cst as u64, *tmp_gpr); + // If the input was positive, saturate to INT_MAX. + // TODO use xorps/xorpd here + let inst = Inst::imm_r(false, 0, *tmp_gpr); // rely on sign-extension to get 0 on 64-bits + inst.emit(sink, flags, state); + let inst = + Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm); + inst.emit(sink, flags, state); + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg()); + inst.emit(sink, flags, state); + + // Jump if >= to done. + one_way_jmp(sink, CC::NB, done); + + // Otherwise, put INT_MAX. + if *dst_size == OperandSize::Size64 { + let inst = Inst::imm_r(true, 0x7fffffffffffffff, *dst); + inst.emit(sink, flags, state); + } else { + let inst = Inst::imm_r(false, 0x7fffffff, *dst); inst.emit(sink, flags, state); } - OperandSize::Size64 => { - // An f64 can represent `i32::min_value() - 1` exactly with precision to spare, so - // there are values less than -2^(N-1) that convert correctly to INT_MIN. - let cst = if output_bits < 64 { - no_overflow_cc = CC::NBE; // > - Ieee64::fcvt_to_sint_negative_overflow(output_bits) - } else { - Ieee64::pow2(output_bits - 1).neg() - }; - let inst = Inst::imm_r(true, cst.bits(), *tmp_gpr); - inst.emit(sink, flags, state); + } else { + let check_positive = sink.get_label(); + + let inst = Inst::trap(*srcloc, TrapCode::BadConversionToInteger); + inst.emit(sink, flags, state); + + // Check if INT_MIN was the correct result: determine the smallest floating point + // number that would convert to INT_MIN, put it in a temporary register, and compare + // against the src register. + // If the src register is less (or in some cases, less-or-equal) than the threshold, + // trap! + + sink.bind_label(not_nan); + + let mut no_overflow_cc = CC::NB; // >= + let output_bits = dst_size.to_bits(); + match *src_size { + OperandSize::Size32 => { + let cst = Ieee32::pow2(output_bits - 1).neg().bits(); + let inst = Inst::imm32_r_unchecked(cst as u64, *tmp_gpr); + inst.emit(sink, flags, state); + } + OperandSize::Size64 => { + // An f64 can represent `i32::min_value() - 1` exactly with precision to spare, + // so there are values less than -2^(N-1) that convert correctly to INT_MIN. + let cst = if output_bits < 64 { + no_overflow_cc = CC::NBE; // > + Ieee64::fcvt_to_sint_negative_overflow(output_bits) + } else { + Ieee64::pow2(output_bits - 1).neg() + }; + let inst = Inst::imm_r(true, cst.bits(), *tmp_gpr); + inst.emit(sink, flags, state); + } } + + let inst = + Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm); + inst.emit(sink, flags, state); + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm.to_reg()), src); + inst.emit(sink, flags, state); + + // jump over trap if src >= or > threshold + one_way_jmp(sink, no_overflow_cc, check_positive); + + let inst = Inst::trap(*srcloc, TrapCode::IntegerOverflow); + inst.emit(sink, flags, state); + + // If positive, it was a real overflow. + + sink.bind_label(check_positive); + + // TODO use xorpd + let inst = Inst::imm_r(false, 0, *tmp_gpr); + inst.emit(sink, flags, state); + + let inst = + Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm); + inst.emit(sink, flags, state); + + let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg()); + inst.emit(sink, flags, state); + + one_way_jmp(sink, CC::NB, done); // jump over trap if 0 >= src + + let inst = Inst::trap(*srcloc, TrapCode::IntegerOverflow); + inst.emit(sink, flags, state); } - let inst = - Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm); - inst.emit(sink, flags, state); - - let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(tmp_xmm.to_reg()), src); - inst.emit(sink, flags, state); - - let check_positive = sink.get_label(); - one_way_jmp(sink, no_overflow_cc, check_positive); // jump over trap if src >= or > threshold - - let inst = Inst::trap(*srcloc, TrapCode::IntegerOverflow); - inst.emit(sink, flags, state); - - // If positive, it was a real overflow. - - sink.bind_label(check_positive); - - // TODO use xorpd - let inst = Inst::imm_r(false, 0, *tmp_gpr); - inst.emit(sink, flags, state); - - let inst = - Inst::gpr_to_xmm(cast_op, RegMem::reg(tmp_gpr.to_reg()), *src_size, *tmp_xmm); - inst.emit(sink, flags, state); - - let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg()); - inst.emit(sink, flags, state); - - one_way_jmp(sink, CC::NB, done); // jump over trap if 0 >= src - - let inst = Inst::trap(*srcloc, TrapCode::IntegerOverflow); - inst.emit(sink, flags, state); - sink.bind_label(done); } Inst::CvtFloatToUintSeq { src_size, dst_size, + is_saturating, src, dst, tmp_gpr, tmp_xmm, srcloc, } => { - // Emits the following sequence: + // The only difference in behavior between saturating and non-saturating is how we + // handle errors. Emits the following sequence: // // movaps/mov 2**(int_width - 1), %tmp_gpr // movq/movd %tmp_gpr, %tmp_xmm @@ -1925,21 +1980,24 @@ pub(crate) fn emit( // jnb is_large // // ;; check for NaN inputs - // jnp next - // ud2 trap BadConversionToInteger + // jnp not_nan + // -- non-saturating: ud2 trap BadConversionToInteger + // -- saturating: xor %dst, %dst; j done // - // next: + // not_nan: // cvttss2si/cvttsd2si %src, %dst // cmp 0, %dst // jnl done - // ud2 trap IntegerOverflow + // -- non-saturating: ud2 trap IntegerOverflow + // -- saturating: xor %dst, %dst; j done // // is_large: // subss/subsd %tmp_xmm, %src ; <-- we clobber %src here // cvttss2si/cvttss2sd %tmp_x, %dst // cmp 0, %dst // jnl next_is_large - // ud2 trap IntegerOverflow + // -- non-saturating: ud2 trap IntegerOverflow + // -- saturating: movaps $UINT_MAX, %dst; j done // // next_is_large: // add 2**(int_width -1), %dst ;; 2 instructions for 64-bits integers @@ -1986,13 +2044,28 @@ pub(crate) fn emit( let handle_large = sink.get_label(); one_way_jmp(sink, CC::NB, handle_large); // jump to handle_large if src >= large_threshold - let next = sink.get_label(); - one_way_jmp(sink, CC::NP, next); // jump over trap if not NaN + let not_nan = sink.get_label(); + one_way_jmp(sink, CC::NP, not_nan); // jump over trap if not NaN - let inst = Inst::trap(*srcloc, TrapCode::BadConversionToInteger); - inst.emit(sink, flags, state); + if *is_saturating { + // Emit 0. + let inst = Inst::alu_rmi_r( + *dst_size == OperandSize::Size64, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.to_reg()), + *dst, + ); + inst.emit(sink, flags, state); - sink.bind_label(next); + let inst = Inst::jmp_known(BranchTarget::Label(done)); + inst.emit(sink, flags, state); + } else { + // Trap. + let inst = Inst::trap(*srcloc, TrapCode::BadConversionToInteger); + inst.emit(sink, flags, state); + } + + sink.bind_label(not_nan); // Actual truncation for small inputs: if the result is not positive, then we had an // overflow. @@ -2005,8 +2078,24 @@ pub(crate) fn emit( one_way_jmp(sink, CC::NL, done); // if dst >= 0, jump to done - let inst = Inst::trap(*srcloc, TrapCode::IntegerOverflow); - inst.emit(sink, flags, state); + if *is_saturating { + // The input was "small" (< 2**(width -1)), so the only way to get an integer + // overflow is because the input was too small: saturate to the min value, i.e. 0. + let inst = Inst::alu_rmi_r( + *dst_size == OperandSize::Size64, + AluRmiROpcode::Xor, + RegMemImm::reg(dst.to_reg()), + *dst, + ); + inst.emit(sink, flags, state); + + let inst = Inst::jmp_known(BranchTarget::Label(done)); + inst.emit(sink, flags, state); + } else { + // Trap. + let inst = Inst::trap(*srcloc, TrapCode::IntegerOverflow); + inst.emit(sink, flags, state); + } // Now handle large inputs. @@ -2024,8 +2113,26 @@ pub(crate) fn emit( let next_is_large = sink.get_label(); one_way_jmp(sink, CC::NL, next_is_large); // if dst >= 0, jump to next_is_large - let inst = Inst::trap(*srcloc, TrapCode::IntegerOverflow); - inst.emit(sink, flags, state); + if *is_saturating { + // The input was "large" (>= 2**(width -1)), so the only way to get an integer + // overflow is because the input was too large: saturate to the max value. + let inst = Inst::imm_r( + true, + if *dst_size == OperandSize::Size64 { + u64::max_value() + } else { + u32::max_value() as u64 + }, + *dst, + ); + inst.emit(sink, flags, state); + + let inst = Inst::jmp_known(BranchTarget::Label(done)); + inst.emit(sink, flags, state); + } else { + let inst = Inst::trap(*srcloc, TrapCode::IntegerOverflow); + inst.emit(sink, flags, state); + } sink.bind_label(next_is_large); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 044fb0bbf5..fc04a58fc1 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -250,6 +250,7 @@ pub enum Inst { CvtFloatToSintSeq { dst_size: OperandSize, src_size: OperandSize, + is_saturating: bool, /// A copy of the source register, fed by lowering. It is marked as modified during /// register allocation to make sure that the temporary xmm register differs from the src /// register, since both registers are live at the same time in the generated code @@ -265,6 +266,7 @@ pub enum Inst { CvtFloatToUintSeq { src_size: OperandSize, dst_size: OperandSize, + is_saturating: bool, /// A copy of the source register, fed by lowering, reused as a temporary. It is marked as /// modified during register allocation to make sure that the temporary xmm register /// differs from the src register, since both registers are live at the same time in the @@ -578,10 +580,11 @@ impl Inst { pub(crate) fn cvt_float_to_sint_seq( src_size: OperandSize, dst_size: OperandSize, + is_saturating: bool, src: Writable, dst: Writable, - tmp_xmm: Writable, tmp_gpr: Writable, + tmp_xmm: Writable, srcloc: SourceLoc, ) -> Inst { debug_assert!(src.to_reg().get_class() == RegClass::V128); @@ -589,10 +592,11 @@ impl Inst { debug_assert!(tmp_gpr.to_reg().get_class() == RegClass::I64); debug_assert!(dst.to_reg().get_class() == RegClass::I64); Inst::CvtFloatToSintSeq { - src, - dst, src_size, dst_size, + is_saturating, + src, + dst, tmp_gpr, tmp_xmm, srcloc, @@ -602,6 +606,7 @@ impl Inst { pub(crate) fn cvt_float_to_uint_seq( src_size: OperandSize, dst_size: OperandSize, + is_saturating: bool, src: Writable, dst: Writable, tmp_gpr: Writable, @@ -610,12 +615,14 @@ impl Inst { ) -> Inst { debug_assert!(src.to_reg().get_class() == RegClass::V128); debug_assert!(tmp_xmm.to_reg().get_class() == RegClass::V128); + debug_assert!(tmp_gpr.to_reg().get_class() == RegClass::I64); debug_assert!(dst.to_reg().get_class() == RegClass::I64); Inst::CvtFloatToUintSeq { - src, - dst, src_size, dst_size, + is_saturating, + src, + dst, tmp_gpr, tmp_xmm, srcloc, @@ -1363,13 +1370,8 @@ fn x64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { tmp_xmm, tmp_gpr, .. - } => { - collector.add_mod(*src); - collector.add_def(*dst); - collector.add_def(*tmp_xmm); - collector.add_def(*tmp_gpr); } - Inst::CvtFloatToUintSeq { + | Inst::CvtFloatToUintSeq { src, dst, tmp_gpr, @@ -1633,13 +1635,8 @@ fn x64_map_regs(inst: &mut Inst, mapper: &RUM) { ref mut tmp_xmm, ref mut tmp_gpr, .. - } => { - map_mod(mapper, src); - map_def(mapper, dst); - map_def(mapper, tmp_xmm); - map_def(mapper, tmp_gpr); } - Inst::CvtFloatToUintSeq { + | Inst::CvtFloatToUintSeq { ref mut src, ref mut dst, ref mut tmp_gpr, diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 4e8d1e5906..acbd073986 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -1016,7 +1016,7 @@ fn lower_insn_to_regs>( }; } - Opcode::FcvtToUint | Opcode::FcvtToSint => { + Opcode::FcvtToUint | Opcode::FcvtToUintSat | Opcode::FcvtToSint | Opcode::FcvtToSintSat => { let src = input_to_reg(ctx, inputs[0]); let dst = output_to_reg(ctx, outputs[0]); @@ -1036,23 +1036,23 @@ fn lower_insn_to_regs>( OperandSize::Size64 }; - let to_signed = op == Opcode::FcvtToSint; + let to_signed = op == Opcode::FcvtToSint || op == Opcode::FcvtToSintSat; + let is_sat = op == Opcode::FcvtToUintSat || op == Opcode::FcvtToSintSat; let src_copy = ctx.alloc_tmp(RegClass::V128, input_ty); ctx.emit(Inst::gen_move(src_copy, src, input_ty)); + let tmp_xmm = ctx.alloc_tmp(RegClass::V128, input_ty); + let tmp_gpr = ctx.alloc_tmp(RegClass::I64, output_ty); + let srcloc = ctx.srcloc(insn); if to_signed { - let tmp_xmm = ctx.alloc_tmp(RegClass::V128, input_ty); - let tmp_gpr = ctx.alloc_tmp(RegClass::I64, output_ty); ctx.emit(Inst::cvt_float_to_sint_seq( - src_size, dst_size, src_copy, dst, tmp_xmm, tmp_gpr, srcloc, + src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, srcloc, )); } else { - let tmp_xmm = ctx.alloc_tmp(RegClass::V128, input_ty); - let tmp_gpr = ctx.alloc_tmp(RegClass::I64, output_ty); ctx.emit(Inst::cvt_float_to_uint_seq( - src_size, dst_size, src_copy, dst, tmp_gpr, tmp_xmm, srcloc, + src_size, dst_size, is_sat, src_copy, dst, tmp_gpr, tmp_xmm, srcloc, )); } }