Address review comments.

Benjamin Bouvier
2020-08-21 12:40:47 +02:00
parent ee76e01efc
commit 7c85654285
4 changed files with 300 additions and 263 deletions


@@ -43,7 +43,7 @@ pub fn has_side_effect(func: &Function, inst: Inst) -> bool {
/// Does the given instruction have a side effect as per [has_side_effect], or is it a load
/// (excluding the get_pinned_reg opcode)?
pub fn has_side_effect_or_load_not_get_pinned_reg(func: &Function, inst: Inst) -> bool {
pub fn has_lowering_side_effect(func: &Function, inst: Inst) -> bool {
let op = func.dfg[inst].opcode();
op != Opcode::GetPinnedReg && (has_side_effect(func, inst) || op.can_load())
}
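
As context for the rename: the predicate treats loads as having a lowering side effect while exempting `get_pinned_reg`, which only reads a fixed register. A minimal, self-contained sketch of its shape, with stub opcodes standing in for the real `Function`/`Inst` API (the stubs are assumptions, not Cranelift types):

```rust
#[derive(PartialEq)]
enum Opcode {
    GetPinnedReg,
    Load,
    Store,
    Iadd,
}

fn has_side_effect(op: &Opcode) -> bool {
    // Stub: in Cranelift this inspects traps, stores, calls, etc.
    matches!(op, Opcode::Store)
}

fn can_load(op: &Opcode) -> bool {
    matches!(op, Opcode::Load)
}

fn has_lowering_side_effect(op: &Opcode) -> bool {
    *op != Opcode::GetPinnedReg && (has_side_effect(op) || can_load(op))
}

fn main() {
    assert!(has_lowering_side_effect(&Opcode::Load));
    assert!(has_lowering_side_effect(&Opcode::Store));
    assert!(!has_lowering_side_effect(&Opcode::GetPinnedReg));
    assert!(!has_lowering_side_effect(&Opcode::Iadd));
}
```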


@@ -360,6 +360,13 @@ pub enum Inst {
JmpKnown { dst: BranchTarget },
/// One-way conditional branch: jcond cond target.
///
/// This instruction is useful when we have conditional jumps depending on more than two
/// conditions; see, for instance, the lowering of Brz/Brnz with Fcmp inputs.
///
/// A note of caution: in contexts where the branch target is another block, this has to be the
/// same successor as the one specified in the terminator branch of the current block.
/// Otherwise, this might confuse register allocation by creating new invisible edges.
JmpIf { cc: CC, taken: BranchTarget },
/// Two-way conditional branch: jcond cond target target.


@@ -124,29 +124,37 @@ struct InsnOutput {
output: usize,
}
/// Returns the source instruction if the given `input` is a result produced by an instruction
/// with opcode `op`.
// TODO investigate failures with checking against the result index.
fn matches_input<C: LowerCtx<I = Inst>>(
ctx: &mut C,
input: InsnInput,
op: Opcode,
) -> Option<IRInst> {
let inputs = ctx.get_input(input.insn, input.input);
if let Some((src_inst, _)) = inputs.inst {
inputs.inst.and_then(|(src_inst, _)| {
let data = ctx.data(src_inst);
if data.opcode() == op {
return Some(src_inst);
}
}
None
None
})
}
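
The `and_then` form above mirrors a standalone pattern: follow the def of an input back to its producer and accept it only if the opcode matches. A toy model, with hypothetical stand-in types rather than the real `LowerCtx` API:

```rust
#[derive(Clone, Copy, PartialEq, Debug)]
enum Opcode {
    Ishl,
    Iadd,
}

struct Producer {
    opcode: Opcode,
    id: u32,
}

fn matches_input(producer: Option<&Producer>, op: Opcode) -> Option<u32> {
    // Accept the producing instruction only if it has the requested opcode.
    producer.and_then(|p| if p.opcode == op { Some(p.id) } else { None })
}

fn main() {
    let shl = Producer { opcode: Opcode::Ishl, id: 7 };
    assert_eq!(matches_input(Some(&shl), Opcode::Ishl), Some(7));
    assert_eq!(matches_input(Some(&shl), Opcode::Iadd), None);
    assert_eq!(matches_input(None, Opcode::Ishl), None);
}
```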
fn lowerinput_to_reg(ctx: Ctx, input: LowerInput) -> Reg {
ctx.use_input_reg(input);
input.reg
}
/// Put the given input into a register, and mark it as used (side-effect).
fn input_to_reg(ctx: Ctx, spec: InsnInput) -> Reg {
let inputs = ctx.get_input(spec.insn, spec.input);
ctx.use_input_reg(inputs);
inputs.reg
let input = ctx.get_input(spec.insn, spec.input);
lowerinput_to_reg(ctx, input)
}
/// An extension specification for `extend_input_to_reg`.
#[derive(Clone, Copy)]
enum ExtSpec {
ZeroExtendTo32,
ZeroExtendTo64,
@@ -163,6 +171,12 @@ fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
};
let input_size = ctx.input_ty(spec.insn, spec.input).bits();
let requested_ty = if requested_size == 32 {
types::I32
} else {
types::I64
};
let ext_mode = match (input_size, requested_size) {
(a, b) if a == b => return input_to_reg(ctx, spec),
(a, 32) if a == 1 || a == 8 => ExtMode::BL,
@@ -173,12 +187,6 @@ fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
_ => unreachable!(),
};
let requested_ty = if requested_size == 32 {
types::I32
} else {
types::I64
};
let src = input_to_reg_mem(ctx, spec);
let dst = ctx.alloc_tmp(RegClass::I64, requested_ty);
match ext_spec {
@@ -196,21 +204,26 @@ fn extend_input_to_reg(ctx: Ctx, spec: InsnInput, ext_spec: ExtSpec) -> Reg {
dst.to_reg()
}
fn lowerinput_to_reg_mem(ctx: Ctx, input: LowerInput) -> RegMem {
// TODO handle memory.
RegMem::reg(lowerinput_to_reg(ctx, input))
}
/// Put the given input into a register or a memory operand.
/// Effectful: may mark the given input as used, when returning the register form.
fn input_to_reg_mem(ctx: Ctx, spec: InsnInput) -> RegMem {
// TODO handle memory.
RegMem::reg(input_to_reg(ctx, spec))
let input = ctx.get_input(spec.insn, spec.input);
lowerinput_to_reg_mem(ctx, input)
}
/// Returns the immediate if the given input is a constant that can be properly sign-extended,
/// without any possible side effect.
fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
ctx.get_input(spec.insn, spec.input).constant.and_then(|x| {
fn lowerinput_to_sext_imm(input: LowerInput, input_ty: Type) -> Option<u32> {
input.constant.and_then(|x| {
// For i64 instructions (prefixed with REX.W), require that the immediate will sign-extend
// to 64 bits. For other sizes, it doesn't matter and we can just use the plain
// constant.
if ctx.input_ty(spec.insn, spec.input).bytes() != 8 || low32_will_sign_extend_to_64(x) {
if input_ty.bytes() != 8 || low32_will_sign_extend_to_64(x) {
Some(x as u32)
} else {
None
@@ -218,6 +231,12 @@ fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
})
}
fn input_to_sext_imm(ctx: Ctx, spec: InsnInput) -> Option<u32> {
let input = ctx.get_input(spec.insn, spec.input);
let input_ty = ctx.input_ty(spec.insn, spec.input);
lowerinput_to_sext_imm(input, input_ty)
}
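
The check delegated to `low32_will_sign_extend_to_64` above has a compact definition. A self-contained sketch consistent with its use here (the exact body in the backend may differ): a 64-bit constant can be encoded as a 32-bit immediate of a REX.W-prefixed instruction only if sign-extending its low 32 bits reproduces the value.

```rust
fn low32_will_sign_extend_to_64(x: u64) -> bool {
    let xs = x as i64;
    // Shift the low 32 bits up and arithmetically back down: if nothing
    // changes, the low half sign-extends to the full 64-bit value.
    xs == ((xs << 32) >> 32)
}

fn main() {
    assert!(low32_will_sign_extend_to_64(0x7fff_ffff)); // i32::MAX
    assert!(low32_will_sign_extend_to_64(u64::MAX)); // -1 as i64
    assert!(!low32_will_sign_extend_to_64(0x8000_0000)); // sign-extends to a different value
    assert!(!low32_will_sign_extend_to_64(0x1_0000_0000)); // needs the high half
}
```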
fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
ctx.get_input(spec.insn, spec.input).constant
}
@@ -225,9 +244,11 @@ fn input_to_imm(ctx: Ctx, spec: InsnInput) -> Option<u64> {
/// Put the given input into an immediate, a register or a memory operand.
/// Effectful: may mark the given input as used, when returning the register form.
fn input_to_reg_mem_imm(ctx: Ctx, spec: InsnInput) -> RegMemImm {
match input_to_sext_imm(ctx, spec) {
let input = ctx.get_input(spec.insn, spec.input);
let input_ty = ctx.input_ty(spec.insn, spec.input);
match lowerinput_to_sext_imm(input, input_ty) {
Some(x) => RegMemImm::imm(x),
None => match input_to_reg_mem(ctx, spec) {
None => match lowerinput_to_reg_mem(ctx, input) {
RegMem::Reg { reg } => RegMemImm::reg(reg),
RegMem::Mem { addr } => RegMemImm::mem(addr),
},
@@ -252,34 +273,88 @@ fn emit_cmp(ctx: Ctx, insn: IRInst) {
ctx.emit(Inst::cmp_rmi_r(ty.bytes() as u8, rhs, lhs));
}
#[derive(PartialEq)]
enum FcmpOperands {
Swap,
DontSwap,
/// A specification for an fcmp emission.
enum FcmpSpec {
/// Normal flow.
Normal,
/// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that
/// happens with `InvertedEqualOrConditions`.
///
/// This is useful in contexts where it is hard or inefficient to produce a single instruction (or
/// sequence of instructions) that checks for an "AND" combination of condition codes; see, for
/// instance, the lowering of Select.
InvertEqual,
}
fn emit_fcmp(ctx: Ctx, insn: IRInst, swap_operands: FcmpOperands) {
/// This explains how to interpret the results of an fcmp instruction.
enum FcmpCondResult {
/// The given condition code must be set.
Condition(CC),
/// Both condition codes must be set.
AndConditions(CC, CC),
/// Either of the condition codes must be set.
OrConditions(CC, CC),
/// The associated spec was set to `FcmpSpec::InvertEqual`, and Equal has been inverted to
/// NotEqual. Either of the condition codes must be set, and the user must invert the meaning of
/// the condition code results accordingly. When the spec is set to `FcmpSpec::Normal`, this
/// case can't be reached.
InvertedEqualOrConditions(CC, CC),
}
fn emit_fcmp(ctx: Ctx, insn: IRInst, mut cond_code: FloatCC, spec: FcmpSpec) -> FcmpCondResult {
let (flip_operands, inverted_equal) = match cond_code {
FloatCC::LessThan
| FloatCC::LessThanOrEqual
| FloatCC::UnorderedOrGreaterThan
| FloatCC::UnorderedOrGreaterThanOrEqual => {
cond_code = cond_code.reverse();
(true, false)
}
FloatCC::Equal => {
let inverted_equal = match spec {
FcmpSpec::Normal => false,
FcmpSpec::InvertEqual => {
cond_code = FloatCC::NotEqual; // same as .inverse()
true
}
};
(false, inverted_equal)
}
_ => (false, false),
};
// After the reversals/inversions above, the remaining condition code maps to a CC that
// `from_floatcc` can construct and that a direct float comparison can set in the flags
// register; emit that comparison here.
let input_ty = ctx.input_ty(insn, 0);
let op = match input_ty {
let op = match ctx.input_ty(insn, 0) {
types::F32 => SseOpcode::Ucomiss,
types::F64 => SseOpcode::Ucomisd,
_ => panic!("Bad input type to Fcmp"),
};
let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
let (lhs, rhs) = if swap_operands == FcmpOperands::Swap {
(
input_to_reg(ctx, inputs[1]),
input_to_reg_mem(ctx, inputs[0]),
)
let (lhs_input, rhs_input) = if flip_operands {
(inputs[1], inputs[0])
} else {
(
input_to_reg(ctx, inputs[0]),
input_to_reg_mem(ctx, inputs[1]),
)
(inputs[0], inputs[1])
};
let lhs = input_to_reg(ctx, lhs_input);
let rhs = input_to_reg_mem(ctx, rhs_input);
ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
let cond_result = match cond_code {
FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
FloatCC::NotEqual if inverted_equal => {
FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ)
}
FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ),
_ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)),
};
cond_result
}
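
To make the mapping concrete: after `ucomiss`/`ucomisd`, x86 sets ZF/PF/CF to 111 for unordered, 000 for greater-than, 001 for less-than, and 100 for equal. The sketch below models the classification performed above, with plain strings standing in for the backend's CC types; it is illustrative only, not the actual API.

```rust
#[derive(Debug, PartialEq)]
enum CondResult {
    Condition(&'static str),
    AndConditions(&'static str, &'static str),
    OrConditions(&'static str, &'static str),
}

fn classify(float_cc: &str) -> CondResult {
    match float_cc {
        // Ordered-and-equal needs two bits: parity clear (ordered) AND zero set.
        "Equal" => CondResult::AndConditions("NP", "Z"),
        // Unordered-or-unequal is the complement: parity set OR zero clear.
        "NotEqual" => CondResult::OrConditions("P", "NZ"),
        // Greater-than maps to "above": CF = 0 and ZF = 0, which also
        // excludes unordered (unordered sets CF = 1).
        "GreaterThan" => CondResult::Condition("NBE"),
        "GreaterThanOrEqual" => CondResult::Condition("NB"),
        // The LessThan-family codes are handled by reversing the comparison
        // first, exactly as `emit_fcmp` flips its operands above.
        other => panic!("reverse operands before classifying {}", other),
    }
}

fn main() {
    assert_eq!(classify("Equal"), CondResult::AndConditions("NP", "Z"));
    assert_eq!(classify("NotEqual"), CondResult::OrConditions("P", "NZ"));
}
```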
fn make_libcall_sig(ctx: Ctx, insn: IRInst, call_conv: CallConv, ptr_ty: Type) -> Signature {
@@ -350,33 +425,31 @@ fn emit_vm_call<C: LowerCtx<I = Inst>>(
/// Returns the shift input and amount if the given input is a shift by a constant value less
/// than or equal to 3. The goal is to embed it within an address mode.
fn matches_small_cst_shift<C: LowerCtx<I = Inst>>(
fn matches_small_constant_shift<C: LowerCtx<I = Inst>>(
ctx: &mut C,
spec: InsnInput,
) -> Option<(InsnInput, u8)> {
if let Some(shift) = matches_input(ctx, spec, Opcode::Ishl) {
if let Some(shift_amt) = input_to_imm(
matches_input(ctx, spec, Opcode::Ishl).and_then(|shift| {
match input_to_imm(
ctx,
InsnInput {
insn: shift,
input: 1,
},
) {
if shift_amt <= 3 {
return Some((
InsnInput {
insn: shift,
input: 0,
},
shift_amt as u8,
));
}
Some(shift_amt) if shift_amt <= 3 => Some((
InsnInput {
insn: shift,
input: 0,
},
shift_amt as u8,
)),
_ => None,
}
}
None
})
}
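
The bound of 3 comes from x86 addressing: the SIB byte can scale an index register only by 1, 2, 4, or 8, i.e. by `1 << shift` for `shift <= 3`, so `base + (index << s)` folds into a single address mode. A standalone illustration:

```rust
fn shift_fits_in_amode(shift_amt: u64) -> Option<u8> {
    // Only scales 1, 2, 4, 8 are encodable in a SIB byte.
    if shift_amt <= 3 {
        Some(shift_amt as u8)
    } else {
        None
    }
}

fn main() {
    assert_eq!(shift_fits_in_amode(3).map(|s| 1u8 << s), Some(8));
    assert_eq!(shift_fits_in_amode(4), None); // scale 16 needs a separate shift
}
```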
fn lower_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: u32) -> Amode {
fn lower_to_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: u32) -> Amode {
// We now have either an add that we must materialize, or some other input, plus the
// final offset.
if let Some(add) = matches_input(ctx, spec, Opcode::Iadd) {
@@ -394,14 +467,16 @@ fn lower_amode<C: LowerCtx<I = Inst>>(ctx: &mut C, spec: InsnInput, offset: u32)
// TODO heap_addr legalization generates a uext64 *after* the shift, so these optimizations
// aren't happening in the wasm case. We could do better, given some range analysis.
let (base, index, shift) = if let Some((shift_input, shift_amt)) =
matches_small_cst_shift(ctx, add_inputs[0])
matches_small_constant_shift(ctx, add_inputs[0])
{
(
input_to_reg(ctx, add_inputs[1]),
input_to_reg(ctx, shift_input),
shift_amt,
)
} else if let Some((shift_input, shift_amt)) = matches_small_cst_shift(ctx, add_inputs[1]) {
} else if let Some((shift_input, shift_amt)) =
matches_small_constant_shift(ctx, add_inputs[1])
{
(
input_to_reg(ctx, add_inputs[0]),
input_to_reg(ctx, shift_input),
@@ -1027,15 +1102,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
}
Opcode::Fcmp => {
let condcode = inst_fp_condcode(ctx.data(insn));
let cond_code = inst_fp_condcode(ctx.data(insn));
let input_ty = ctx.input_ty(insn, 0);
if !input_ty.is_vector() {
let op = match input_ty {
types::F32 => SseOpcode::Ucomiss,
types::F64 => SseOpcode::Ucomisd,
_ => panic!("Bad input type to fcmp: {}", input_ty),
};
// Unordered is returned by setting ZF, PF, CF <- 111
// Greater than by ZF, PF, CF <- 000
// Less than by ZF, PF, CF <- 001
@@ -1051,71 +1120,35 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// set, then both the ZF and CF flag bits must also be set we can get away with using
// one setcc for most condition codes.
match condcode {
FloatCC::LessThan
| FloatCC::LessThanOrEqual
| FloatCC::UnorderedOrGreaterThan
| FloatCC::UnorderedOrGreaterThanOrEqual => {
// setb and setbe for ordered LessThan and LessThanOrEqual check if CF = 1,
// which doesn't exclude unorderedness. To get around this we can reverse the
// operands and the cc test to instead check that CF and ZF are 0, which also
// excludes unorderedness. Using similar logic we also reverse
// UnorderedOrGreaterThan and UnorderedOrGreaterThanOrEqual and ensure that ZF
// or CF is 1 to exclude orderedness.
let lhs = input_to_reg_mem(ctx, inputs[0]);
let rhs = input_to_reg(ctx, inputs[1]);
let dst = output_to_reg(ctx, outputs[0]);
ctx.emit(Inst::xmm_cmp_rm_r(op, lhs, rhs));
let condcode = condcode.reverse();
let cc = CC::from_floatcc(condcode);
let dst = output_to_reg(ctx, outputs[0]);
match emit_fcmp(ctx, insn, cond_code, FcmpSpec::Normal) {
FcmpCondResult::Condition(cc) => {
ctx.emit(Inst::setcc(cc, dst));
}
FloatCC::Equal => {
// Outlier case: equal means both the operands are ordered and equal; we cannot
// get around checking the parity bit to determine if the result was ordered.
let lhs = input_to_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
let dst = output_to_reg(ctx, outputs[0]);
let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, types::I32);
ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
ctx.emit(Inst::setcc(CC::NP, tmp_gpr1));
ctx.emit(Inst::setcc(CC::Z, dst));
FcmpCondResult::AndConditions(cc1, cc2) => {
let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
ctx.emit(Inst::setcc(cc1, tmp));
ctx.emit(Inst::setcc(cc2, dst));
ctx.emit(Inst::alu_rmi_r(
false,
AluRmiROpcode::And,
RegMemImm::reg(tmp_gpr1.to_reg()),
RegMemImm::reg(tmp.to_reg()),
dst,
));
}
FloatCC::NotEqual => {
// Outlier case: not equal means either the operands are unordered, or they're
// not the same value.
let lhs = input_to_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
let dst = output_to_reg(ctx, outputs[0]);
let tmp_gpr1 = ctx.alloc_tmp(RegClass::I64, types::I32);
ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
ctx.emit(Inst::setcc(CC::P, tmp_gpr1));
ctx.emit(Inst::setcc(CC::NZ, dst));
FcmpCondResult::OrConditions(cc1, cc2) => {
let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
ctx.emit(Inst::setcc(cc1, tmp));
ctx.emit(Inst::setcc(cc2, dst));
ctx.emit(Inst::alu_rmi_r(
false,
AluRmiROpcode::Or,
RegMemImm::reg(tmp_gpr1.to_reg()),
RegMemImm::reg(tmp.to_reg()),
dst,
));
}
_ => {
// For all remaining condition codes we can handle things with one check.
let lhs = input_to_reg(ctx, inputs[0]);
let rhs = input_to_reg_mem(ctx, inputs[1]);
let dst = output_to_reg(ctx, outputs[0]);
let cc = CC::from_floatcc(condcode);
ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
ctx.emit(Inst::setcc(cc, dst));
}
FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
}
} else {
let op = match input_ty {
@@ -1126,7 +1159,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// Since some packed comparisons are not available, some of the condition codes
// must be inverted, with a corresponding `flip` of the operands.
let (imm, flip) = match condcode {
let (imm, flip) = match cond_code {
FloatCC::GreaterThan => (FcmpImm::LessThan, true),
FloatCC::GreaterThanOrEqual => (FcmpImm::LessThanOrEqual, true),
FloatCC::UnorderedOrLessThan => (FcmpImm::UnorderedOrGreaterThan, true),
@@ -1134,9 +1167,9 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
(FcmpImm::UnorderedOrGreaterThanOrEqual, true)
}
FloatCC::OrderedNotEqual | FloatCC::UnorderedOrEqual => {
panic!("unsupported float condition code: {}", condcode)
panic!("unsupported float condition code: {}", cond_code)
}
_ => (FcmpImm::from(condcode), false),
_ => (FcmpImm::from(cond_code), false),
};
// Determine the operands of the comparison, possibly by flipping them.
@@ -1225,35 +1258,77 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let srcloc = ctx.srcloc(insn);
let trap_code = inst_trapcode(ctx.data(insn)).unwrap();
let cc = if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
let condcode = inst_condcode(ctx.data(insn));
if matches_input(ctx, inputs[0], Opcode::IaddIfcout).is_some() {
let cond_code = inst_condcode(ctx.data(insn));
// The flags must not have been clobbered by any other instruction between the
// iadd_ifcout and this instruction, as verified by the CLIF validator; so we can
// simply use the flags here.
CC::from_intcc(condcode)
let cc = CC::from_intcc(cond_code);
ctx.emit_safepoint(Inst::TrapIf {
trap_code,
srcloc,
cc,
});
} else if op == Opcode::Trapif {
let condcode = inst_condcode(ctx.data(insn));
let cc = CC::from_intcc(condcode);
let cond_code = inst_condcode(ctx.data(insn));
let cc = CC::from_intcc(cond_code);
// Verification ensures that the input is always a single-def ifcmp.
let ifcmp_insn = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap();
emit_cmp(ctx, ifcmp_insn);
cc
let ifcmp = matches_input(ctx, inputs[0], Opcode::Ifcmp).unwrap();
emit_cmp(ctx, ifcmp);
ctx.emit_safepoint(Inst::TrapIf {
trap_code,
srcloc,
cc,
});
} else {
let condcode = inst_fp_condcode(ctx.data(insn));
let cc = CC::from_floatcc(condcode);
let cond_code = inst_fp_condcode(ctx.data(insn));
// Verification ensures that the input is always a single-def ffcmp.
let ffcmp_insn = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap();
emit_fcmp(ctx, ffcmp_insn, FcmpOperands::DontSwap);
cc
};
let ffcmp = matches_input(ctx, inputs[0], Opcode::Ffcmp).unwrap();
ctx.emit_safepoint(Inst::TrapIf {
trap_code,
srcloc,
cc,
});
match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
FcmpCondResult::Condition(cc) => ctx.emit_safepoint(Inst::TrapIf {
trap_code,
srcloc,
cc,
}),
FcmpCondResult::AndConditions(cc1, cc2) => {
// A bit unfortunate, but materialize the flags in their own register, and
// check against this.
let tmp = ctx.alloc_tmp(RegClass::I64, types::I32);
let tmp2 = ctx.alloc_tmp(RegClass::I64, types::I32);
ctx.emit(Inst::setcc(cc1, tmp));
ctx.emit(Inst::setcc(cc2, tmp2));
ctx.emit(Inst::alu_rmi_r(
false, /* is_64 */
AluRmiROpcode::And,
RegMemImm::reg(tmp.to_reg()),
tmp2,
));
ctx.emit_safepoint(Inst::TrapIf {
trap_code,
srcloc,
cc: CC::NZ,
});
}
FcmpCondResult::OrConditions(cc1, cc2) => {
ctx.emit_safepoint(Inst::TrapIf {
trap_code,
srcloc,
cc: cc1,
});
ctx.emit_safepoint(Inst::TrapIf {
trap_code,
srcloc,
cc: cc2,
});
}
FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
};
};
}
Opcode::F64const => {
@@ -1751,7 +1826,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::Uload32
| Opcode::Sload32 => {
assert_eq!(inputs.len(), 1, "only one input for load operands");
lower_amode(ctx, inputs[0], offset as u32)
lower_to_amode(ctx, inputs[0], offset as u32)
}
Opcode::LoadComplex
@@ -1842,7 +1917,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let addr = match op {
Opcode::Store | Opcode::Istore8 | Opcode::Istore16 | Opcode::Istore32 => {
assert_eq!(inputs.len(), 2, "only one input for store memory operands");
lower_amode(ctx, inputs[1], offset as u32)
lower_to_amode(ctx, inputs[1], offset as u32)
}
Opcode::StoreComplex
@@ -1899,11 +1974,13 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} else {
None
};
// Make sure that both args are in virtual regs, since in effect we have to do a
// parallel copy to get them safely to the AtomicRmwSeq input regs, and that's not
// guaranteed safe if either is in a real reg.
addr = ctx.ensure_in_vreg(addr, types::I64);
arg2 = ctx.ensure_in_vreg(arg2, types::I64);
// Move the args to the preordained AtomicRMW input regs. Note that `AtomicRmwSeq`
// operates at whatever width is specified by `ty`, so there's no need to
// zero-extend `arg2` in the case of `ty` being I8/I16/I32.
@@ -1917,6 +1994,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
arg2,
types::I64,
));
// Now the AtomicRmwSeq (pseudo-) instruction itself
let op = inst_common::AtomicRmwOp::from(inst_atomic_rmw_op(ctx.data(insn)).unwrap());
ctx.emit(Inst::AtomicRmwSeq {
@@ -1924,6 +2002,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
op,
srcloc,
});
// And finally, copy the preordained AtomicRmwSeq output reg to its destination.
ctx.emit(Inst::gen_move(dst, regs::rax(), types::I64));
}
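
Why `ensure_in_vreg` matters here, as a toy model (register names below are illustrative, not the actual `AtomicRmwSeq` constraints): moving two values into two fixed real registers is a parallel copy, and a naive sequential copy is wrong if one source already occupies the other's destination. Routing through fresh virtual registers makes any emission order safe.

```rust
// Each move is (source, destination).
fn copy_via_temps(addr_src: &str, arg2_src: &str) -> Vec<(String, String)> {
    let mut moves = Vec::new();
    // Stage 1: both sources into fresh virtual temporaries. These cannot
    // clobber anything, even if a source is already one of the fixed regs.
    moves.push((addr_src.to_string(), "v_addr".to_string()));
    moves.push((arg2_src.to_string(), "v_arg2".to_string()));
    // Stage 2: temporaries into the fixed (hypothetical) input registers.
    moves.push(("v_addr".to_string(), "fixed_reg_a".to_string()));
    moves.push(("v_arg2".to_string(), "fixed_reg_b".to_string()));
    moves
}

fn main() {
    // Hazardous case: `addr` already lives in fixed_reg_b; copying
    // arg2 -> fixed_reg_b first would destroy it without the temporaries.
    for (src, dst) in copy_via_temps("fixed_reg_b", "rdx") {
        println!("mov {}, {}", dst, src);
    }
}
```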
@@ -1932,7 +2011,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// This is very similar to, but not identical to, the `AtomicRmw` case. As with
// `AtomicRmw`, there's no need to zero-extend narrow values here.
let dst = output_to_reg(ctx, outputs[0]);
let addr = input_to_reg(ctx, inputs[0]);
let addr = lower_to_amode(ctx, inputs[0], 0);
let expected = input_to_reg(ctx, inputs[1]);
let replacement = input_to_reg(ctx, inputs[2]);
let ty_access = ty.unwrap();
@@ -1943,6 +2022,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} else {
None
};
// Move the expected value into %rax. Because there's only one fixed register on
// the input side, we don't have to use `ensure_in_vreg`, as is necessary in the
// `AtomicRmw` case.
@@ -1954,7 +2034,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::LockCmpxchg {
ty: ty_access,
src: replacement,
dst: Amode::imm_reg(0, addr).into(),
dst: addr.into(),
srcloc,
});
// And finally, copy the old value at the location to its destination reg.
@@ -1966,7 +2046,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
// to satisfy the CLIF synchronisation requirements for `AtomicLoad` without the
// need for any fence instructions.
let data = output_to_reg(ctx, outputs[0]);
let addr = input_to_reg(ctx, inputs[0]);
let addr = lower_to_amode(ctx, inputs[0], 0);
let ty_access = ty.unwrap();
assert!(is_valid_atomic_transaction_ty(ty_access));
let memflags = ctx.memflags(insn).expect("memory flags");
@@ -1975,8 +2055,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} else {
None
};
// For the amode, we could do better, but for now just use `0(addr)`.
let rm = RegMem::mem(Amode::imm_reg(0, addr));
let rm = RegMem::mem(addr);
if ty_access == types::I64 {
ctx.emit(Inst::mov64_rm_r(rm, data, srcloc));
} else {
@@ -1993,7 +2073,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
Opcode::AtomicStore => {
// This is a normal store, followed by an `mfence` instruction.
let data = input_to_reg(ctx, inputs[0]);
let addr = input_to_reg(ctx, inputs[1]);
let addr = lower_to_amode(ctx, inputs[1], 0);
let ty_access = ctx.input_ty(insn, 0);
assert!(is_valid_atomic_transaction_ty(ty_access));
let memflags = ctx.memflags(insn).expect("memory flags");
@@ -2002,13 +2082,8 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
} else {
None
};
// For the amode, we could do better, but for now just use `0(addr)`.
ctx.emit(Inst::mov_r_m(
ty_access.bytes() as u8,
data,
Amode::imm_reg(0, addr),
srcloc,
));
ctx.emit(Inst::mov_r_m(ty_access.bytes() as u8, data, addr, srcloc));
ctx.emit(Inst::Fence {
kind: FenceKind::MFence,
});
@@ -2068,81 +2143,36 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
let cond_code = inst_fp_condcode(ctx.data(fcmp));
// See comments in the lowering of Fcmp.
let (cond_code, swap_op, was_equal) = match cond_code {
FloatCC::LessThan
| FloatCC::LessThanOrEqual
| FloatCC::UnorderedOrGreaterThan
| FloatCC::UnorderedOrGreaterThanOrEqual => {
(cond_code.reverse(), FcmpOperands::Swap, false)
}
FloatCC::Equal => {
// Additionally, we invert Equal to NotEqual too: taking LHS if equal would
// mean take it if both CC::NP and CC::Z are set, the conjunction of which
// can't be modeled with a single cmov instruction. Instead, we'll swap LHS
// and RHS in the select operation, and invert the equal to a not-equal
// here.
(FloatCC::NotEqual, FcmpOperands::DontSwap, true)
}
_ => (cond_code, FcmpOperands::DontSwap, false),
};
emit_fcmp(ctx, fcmp, swap_op);
// We request inversion of Equal to NotEqual here: taking LHS if equal would mean
// taking it if both CC::NP and CC::Z are set, the conjunction of which can't be
// modeled with a single cmov instruction. Instead, we'll swap LHS and RHS in the
// select operation, and invert the equal to a not-equal here.
let fcmp_results = emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::InvertEqual);
let (lhs, rhs) = if was_equal {
// See comment above about inverting conditional code.
(
input_to_reg_mem(ctx, inputs[2]),
input_to_reg(ctx, inputs[1]),
)
} else {
(
input_to_reg_mem(ctx, inputs[1]),
input_to_reg(ctx, inputs[2]),
)
let (lhs_input, rhs_input) = match fcmp_results {
FcmpCondResult::InvertedEqualOrConditions(_, _) => (inputs[2], inputs[1]),
FcmpCondResult::Condition(_)
| FcmpCondResult::AndConditions(_, _)
| FcmpCondResult::OrConditions(_, _) => (inputs[1], inputs[2]),
};
let dst = output_to_reg(ctx, outputs[0]);
let ty = ctx.output_ty(insn, 0);
let lhs = if is_int_ty(ty) {
let size = ty.bytes() as u8;
if size == 1 {
// Sign-extend operands to 32, then do a cmove of size 4.
let lhs_se = ctx.alloc_tmp(RegClass::I64, types::I32);
ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se, None));
ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst, None));
RegMem::reg(lhs_se.to_reg())
} else {
ctx.emit(Inst::gen_move(dst, rhs, ty));
lhs
}
let rhs = input_to_reg(ctx, rhs_input);
let dst = output_to_reg(ctx, outputs[0]);
let lhs = if is_int_ty(ty) && ty.bytes() < 4 {
// Special case: since the higher bits are undefined per CLIF semantics, we
// can just apply a 32-bit cmove here. Force the inputs into registers, though:
// a 32-bit cmove reading a spilled 8- or 16-bit slot could access out of bounds.
// Sign-extend operands to 32, then do a cmove of size 4.
RegMem::reg(input_to_reg(ctx, lhs_input))
} else {
debug_assert!(ty == types::F32 || ty == types::F64);
ctx.emit(Inst::gen_move(dst, rhs, ty));
lhs
input_to_reg_mem(ctx, lhs_input)
};
match cond_code {
FloatCC::Equal => {
// See comment above about inverting conditional code.
panic!("can't happen because of above guard");
}
ctx.emit(Inst::gen_move(dst, rhs, ty));
FloatCC::NotEqual => {
// Take lhs if not-equal, that is CC::P or CC:NZ.
if is_int_ty(ty) {
let size = u8::max(ty.bytes() as u8, 4);
ctx.emit(Inst::cmove(size, CC::P, lhs.clone(), dst));
ctx.emit(Inst::cmove(size, CC::NZ, lhs, dst));
} else {
ctx.emit(Inst::xmm_cmove(ty == types::F64, CC::P, lhs.clone(), dst));
ctx.emit(Inst::xmm_cmove(ty == types::F64, CC::NZ, lhs, dst));
}
}
_ => {
let cc = CC::from_floatcc(cond_code);
match fcmp_results {
FcmpCondResult::Condition(cc) => {
if is_int_ty(ty) {
let size = u8::max(ty.bytes() as u8, 4);
ctx.emit(Inst::cmove(size, cc, lhs, dst));
@@ -2150,6 +2180,22 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
}
}
FcmpCondResult::AndConditions(_, _) => {
unreachable!(
"can't AND with select; see above comment about inverting equal"
);
}
FcmpCondResult::InvertedEqualOrConditions(cc1, cc2)
| FcmpCondResult::OrConditions(cc1, cc2) => {
if is_int_ty(ty) {
let size = u8::max(ty.bytes() as u8, 4);
ctx.emit(Inst::cmove(size, cc1, lhs.clone(), dst));
ctx.emit(Inst::cmove(size, cc2, lhs, dst));
} else {
ctx.emit(Inst::xmm_cmove(ty == types::F64, cc1, lhs.clone(), dst));
ctx.emit(Inst::xmm_cmove(ty == types::F64, cc2, lhs, dst));
}
}
}
} else {
let cc = if let Some(icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
@@ -2164,27 +2210,27 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
CC::NZ
};
let lhs = input_to_reg_mem(ctx, inputs[1]);
let rhs = input_to_reg(ctx, inputs[2]);
let dst = output_to_reg(ctx, outputs[0]);
let ty = ctx.output_ty(insn, 0);
ctx.emit(Inst::gen_move(dst, rhs, ty));
if is_int_ty(ty) {
let size = ty.bytes() as u8;
if size == 1 {
// Sign-extend operands to 32, then do a cmove of size 4.
let lhs_se = ctx.alloc_tmp(RegClass::I64, types::I32);
ctx.emit(Inst::movsx_rm_r(ExtMode::BL, lhs, lhs_se, None));
ctx.emit(Inst::movsx_rm_r(ExtMode::BL, RegMem::reg(rhs), dst, None));
ctx.emit(Inst::cmove(4, cc, RegMem::reg(lhs_se.to_reg()), dst));
let mut size = ty.bytes() as u8;
let lhs = if size < 4 {
// Special case: since the higher bits are undefined per CLIF semantics, we
// can just apply a 32-bit cmove here. Force the inputs into registers, though:
// a 32-bit cmove reading a spilled 8- or 16-bit slot could access out of bounds.
size = 4;
RegMem::reg(input_to_reg(ctx, inputs[1]))
} else {
ctx.emit(Inst::gen_move(dst, rhs, ty));
ctx.emit(Inst::cmove(size, cc, lhs, dst));
}
input_to_reg_mem(ctx, inputs[1])
};
ctx.emit(Inst::cmove(size, cc, lhs, dst));
} else {
debug_assert!(ty == types::F32 || ty == types::F64);
ctx.emit(Inst::gen_move(dst, rhs, ty));
let lhs = input_to_reg_mem(ctx, inputs[1]);
ctx.emit(Inst::xmm_cmove(ty == types::F64, cc, lhs, dst));
}
}
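
The `size < 4` special-casing in both branches relies on two facts: x86 `cmovcc` has no 8-bit form (only 16-, 32-, and 64-bit), and CLIF leaves the upper bits of narrow integer results undefined, so widening the cmove to 32 bits is both legal and free. A minimal sketch of the clamp:

```rust
fn cmove_size(ty_bytes: u8) -> u8 {
    // No 8-bit cmov exists; 32-bit is the cheapest legal width for i8/i16.
    u8::max(ty_bytes, 4)
}

fn main() {
    assert_eq!(cmove_size(1), 4); // i8  -> 32-bit cmove
    assert_eq!(cmove_size(2), 4); // i16 -> 32-bit cmove
    assert_eq!(cmove_size(4), 4); // i32
    assert_eq!(cmove_size(8), 8); // i64
}
```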
@@ -2464,47 +2510,29 @@ impl LowerBackend for X64Backend {
} else {
cond_code
};
let cc = CC::from_intcc(cond_code);
ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
} else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
let cond_code = inst_fp_condcode(ctx.data(fcmp));
let cond_code = if op0 == Opcode::Brz {
cond_code.inverse()
} else {
cond_code
};
// See comments in the lowering of Fcmp.
let (cond_code, swap_op) = match cond_code {
FloatCC::LessThan
| FloatCC::LessThanOrEqual
| FloatCC::UnorderedOrGreaterThan
| FloatCC::UnorderedOrGreaterThanOrEqual => {
(cond_code.reverse(), FcmpOperands::Swap)
}
_ => (cond_code, FcmpOperands::DontSwap),
};
emit_fcmp(ctx, fcmp, swap_op);
match cond_code {
FloatCC::Equal => {
// Jump to taken if CC::NP and CC::Z, that is, jump to not-taken if
// CC::P or CC::NZ.
ctx.emit(Inst::jmp_if(CC::P, not_taken));
ctx.emit(Inst::jmp_cond(CC::NZ, not_taken, taken));
}
FloatCC::NotEqual => {
// Jump to taken if CC::P or CC::NZ.
ctx.emit(Inst::jmp_if(CC::P, taken));
ctx.emit(Inst::jmp_cond(CC::NZ, taken, not_taken));
}
_ => {
let cc = CC::from_floatcc(cond_code);
match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) {
FcmpCondResult::Condition(cc) => {
ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
}
FcmpCondResult::AndConditions(cc1, cc2) => {
ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
}
FcmpCondResult::OrConditions(cc1, cc2) => {
ctx.emit(Inst::jmp_if(cc1, taken));
ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
}
FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
}
} else if is_int_ty(src_ty) || is_bool_ty(src_ty) {
let src = input_to_reg(


@@ -4,7 +4,7 @@
use crate::entity::SecondaryMap;
use crate::fx::{FxHashMap, FxHashSet};
use crate::inst_predicates::{has_side_effect_or_load_not_get_pinned_reg, is_constant_64bit};
use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit};
use crate::ir::instructions::BranchInfo;
use crate::ir::types::I64;
use crate::ir::{
@@ -372,7 +372,7 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
for bb in f.layout.blocks() {
cur_color += 1;
for inst in f.layout.block_insts(bb) {
let side_effect = has_side_effect_or_load_not_get_pinned_reg(f, inst);
let side_effect = has_lowering_side_effect(f, inst);
// Assign colors. A new color is chosen *after* any side-effecting instruction.
inst_colors[inst] = InstColor::new(cur_color);
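
The coloring scheme referenced here can be modeled standalone (a hypothetical helper, not the `Lower` API): every instruction receives the current color, and the color is bumped after each instruction with a lowering side effect. Two instructions sharing a color have no side effect between them, so a pure producer may safely be sunk to its consumer.

```rust
fn assign_colors(has_lowering_side_effect: &[bool]) -> Vec<u32> {
    let mut cur_color = 0u32;
    let mut colors = Vec::with_capacity(has_lowering_side_effect.len());
    for &effectful in has_lowering_side_effect {
        colors.push(cur_color);
        if effectful {
            // A new color is chosen *after* any side-effecting instruction.
            cur_color += 1;
        }
    }
    colors
}

fn main() {
    // iadd, iadd, store, iadd, iadd — only the store is effectful here:
    assert_eq!(
        assign_colors(&[false, false, true, false, false]),
        vec![0, 0, 0, 1, 1]
    );
}
```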
@@ -799,15 +799,15 @@ impl<'func, I: VCodeInst> Lower<'func, I> {
ValueDef::Result(src_inst, result_idx) => {
debug!(" -> src inst {}", src_inst);
debug!(
" -> has side effect: {}",
has_side_effect_or_load_not_get_pinned_reg(self.f, src_inst)
" -> has lowering side effect: {}",
has_lowering_side_effect(self.f, src_inst)
);
debug!(
" -> our color is {:?}, src inst is {:?}",
self.inst_color(at_inst),
self.inst_color(src_inst)
);
if !has_side_effect_or_load_not_get_pinned_reg(self.f, src_inst)
if !has_lowering_side_effect(self.f, src_inst)
|| self.inst_color(at_inst) == self.inst_color(src_inst)
{
Some((src_inst, result_idx))
@@ -989,6 +989,8 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
fn use_input_reg(&mut self, input: LowerInput) {
debug!("use_input_reg: vreg {:?} is needed", input.reg);
// We may directly return a real (machine) register when we know that register holds the
// result of an opcode (e.g. GetPinnedReg).
if input.reg.is_virtual() {
self.vreg_needed[input.reg.get_index()] = true;
}