x64: Migrate brff and I128 branching instructions to ISLE (#4599)

https://github.com/bytecodealliance/wasmtime/pull/4599
2022-08-04 08:58:50 -07:00
parent 12a9705fbc
commit 1fc11bbe51
12 changed files with 254 additions and 356 deletions
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -548,9 +548,7 @@
            And
            Or
            Xor
-            Mul
-            And8
-            Or8))
+            Mul))

 (type UnaryRmROpcode extern
      (enum Bsr
@@ -1074,6 +1072,13 @@
 (decl cc_invert (CC) CC)
 (extern constructor cc_invert cc_invert)

+(decl floatcc_inverse (FloatCC) FloatCC)
+(extern constructor floatcc_inverse floatcc_inverse)
+
+;; Matches only when the argument is CC.NZ or CC.Z; any other CC fails to match.
+(decl cc_nz_or_z (CC) CC)
+(extern extractor cc_nz_or_z cc_nz_or_z)
+
 (type AvxOpcode extern
      (enum Vfmadd213ps
            Vfmadd213pd))
@@ -3060,6 +3065,10 @@
 (rule (jmp_known target)
      (SideEffectNoResult.Inst (MInst.JmpKnown target)))

+(decl jmp_if (CC MachLabel) ConsumesFlags)
+(rule (jmp_if cc taken)
+      (ConsumesFlags.ConsumesFlagsSideEffect (MInst.JmpIf cc taken)))
+
 ;; Conditional jump based on the condition code.
 (decl jmp_cond (CC MachLabel MachLabel) ConsumesFlags)
 (rule (jmp_cond cc taken not_taken)
@@ -3070,6 +3079,21 @@
 (rule (jmp_cond_icmp (IcmpCondResult.Condition producer cc) taken not_taken)
      (with_flags_side_effect producer (jmp_cond cc taken not_taken)))

+;; Conditional jump based on the result of an fcmp.
+(decl jmp_cond_fcmp (FcmpCondResult MachLabel MachLabel) SideEffectNoResult)
+(rule (jmp_cond_fcmp (FcmpCondResult.Condition producer cc) taken not_taken)
+      (with_flags_side_effect producer (jmp_cond cc taken not_taken)))
+(rule (jmp_cond_fcmp (FcmpCondResult.AndCondition producer cc1 cc2) taken not_taken)
+      (with_flags_side_effect producer
+                              (consumes_flags_concat
+                                (jmp_if (cc_invert cc1) not_taken)
+                                (jmp_cond (cc_invert cc2) not_taken taken))))
+(rule (jmp_cond_fcmp (FcmpCondResult.OrCondition producer cc1 cc2) taken not_taken)
+      (with_flags_side_effect producer
+                              (consumes_flags_concat
+                                (jmp_if cc1 taken)
+                                (jmp_cond cc2 taken not_taken))))
+
 ;;;; Comparisons ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (type IcmpCondResult (enum (Condition (producer ProducesFlags) (cc CC))))
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -701,12 +701,6 @@ pub enum AluRmiROpcode {
    Xor,
    /// The signless, non-extending (N x N -> N, for N in {32,64}) variant.
    Mul,
-    /// 8-bit form of And. Handled separately as we don't have full 8-bit op
-    /// support (we just use wider instructions). Used only with some sequences
-    /// with SETcc.
-    And8,
-    /// 8-bit form of Or.
-    Or8,
 }

 impl fmt::Debug for AluRmiROpcode {
@@ -720,8 +714,6 @@ impl fmt::Debug for AluRmiROpcode {
            AluRmiROpcode::Or => "or",
            AluRmiROpcode::Xor => "xor",
            AluRmiROpcode::Mul => "imul",
-            AluRmiROpcode::And8 => "and",
-            AluRmiROpcode::Or8 => "or",
        };
        write!(fmt, "{}", name)
    }
@@ -733,16 +725,6 @@ impl fmt::Display for AluRmiROpcode {
    }
 }

-impl AluRmiROpcode {
-    /// Is this a special-cased 8-bit ALU op?
-    pub fn is_8bit(self) -> bool {
-        match self {
-            AluRmiROpcode::And8 | AluRmiROpcode::Or8 => true,
-            _ => false,
-        }
-    }
-}
-
 #[derive(Clone, PartialEq)]
 pub enum UnaryRmROpcode {
    /// Bit-scan reverse.
@@ -1704,32 +1686,6 @@ impl CC {
        }
    }

-    pub(crate) fn from_floatcc(floatcc: FloatCC) -> Self {
-        match floatcc {
-            FloatCC::Ordered => CC::NP,
-            FloatCC::Unordered => CC::P,
-            // Alias for NE
-            FloatCC::OrderedNotEqual => CC::NZ,
-            // Alias for E
-            FloatCC::UnorderedOrEqual => CC::Z,
-            // Alias for A
-            FloatCC::GreaterThan => CC::NBE,
-            // Alias for AE
-            FloatCC::GreaterThanOrEqual => CC::NB,
-            FloatCC::UnorderedOrLessThan => CC::B,
-            FloatCC::UnorderedOrLessThanOrEqual => CC::BE,
-            FloatCC::Equal
-            | FloatCC::NotEqual
-            | FloatCC::LessThan
-            | FloatCC::LessThanOrEqual
-            | FloatCC::UnorderedOrGreaterThan
-            | FloatCC::UnorderedOrGreaterThanOrEqual => panic!(
-                "{:?} can't be lowered to a CC code; treat as special case.",
-                floatcc
-            ),
-        }
-    }
-
    pub(crate) fn get_enc(self) -> u8 {
        self as u8
    }
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -158,7 +158,7 @@ pub(crate) fn emit(
                (reg_g, src2)
            };

-            let mut rex = RexFlags::from(*size);
+            let rex = RexFlags::from(*size);
            if *op == AluRmiROpcode::Mul {
                // We kinda freeloaded Mul into RMI_R_Op, but it doesn't fit the usual pattern, so
                // we have to special-case it.
@@ -191,26 +191,19 @@ pub(crate) fn emit(
                    }
                }
            } else {
-                let (opcode_r, opcode_m, subopcode_i, is_8bit) = match op {
-                    AluRmiROpcode::Add => (0x01, 0x03, 0, false),
-                    AluRmiROpcode::Adc => (0x11, 0x03, 0, false),
-                    AluRmiROpcode::Sub => (0x29, 0x2B, 5, false),
-                    AluRmiROpcode::Sbb => (0x19, 0x2B, 5, false),
-                    AluRmiROpcode::And => (0x21, 0x23, 4, false),
-                    AluRmiROpcode::Or => (0x09, 0x0B, 1, false),
-                    AluRmiROpcode::Xor => (0x31, 0x33, 6, false),
-                    AluRmiROpcode::And8 => (0x20, 0x22, 4, true),
-                    AluRmiROpcode::Or8 => (0x08, 0x0A, 1, true),
+                let (opcode_r, opcode_m, subopcode_i) = match op {
+                    AluRmiROpcode::Add => (0x01, 0x03, 0),
+                    AluRmiROpcode::Adc => (0x11, 0x03, 0),
+                    AluRmiROpcode::Sub => (0x29, 0x2B, 5),
+                    AluRmiROpcode::Sbb => (0x19, 0x2B, 5),
+                    AluRmiROpcode::And => (0x21, 0x23, 4),
+                    AluRmiROpcode::Or => (0x09, 0x0B, 1),
+                    AluRmiROpcode::Xor => (0x31, 0x33, 6),
                    AluRmiROpcode::Mul => panic!("unreachable"),
                };
-                assert!(!(is_8bit && *size == OperandSize::Size64));

                match src2 {
                    RegMemImm::Reg { reg: reg_e } => {
-                        if is_8bit {
-                            rex.always_emit_if_8bit_needed(reg_e);
-                            rex.always_emit_if_8bit_needed(reg_g);
-                        }
                        // GCC/llvm use the swapped operand encoding (viz., the R/RM vs RM/R
                        // duality). Do this too, so as to be able to compare generated machine
                        // code easily.
@@ -227,9 +220,6 @@ pub(crate) fn emit(

                    RegMemImm::Mem { addr } => {
                        let amode = addr.finalize(state, sink);
-                        if is_8bit {
-                            rex.always_emit_if_8bit_needed(reg_g);
-                        }
                        // Here we revert to the "normal" G-E ordering.
                        emit_std_reg_mem(
                            sink,
@@ -245,7 +235,6 @@ pub(crate) fn emit(
                    }

                    RegMemImm::Imm { simm32 } => {
-                        assert!(!is_8bit);
                        let use_imm8 = low8_will_sign_extend_to_32(simm32);
                        let opcode = if use_imm8 { 0x83 } else { 0x81 };
                        // And also here we use the "normal" G-E ordering.
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -1258,86 +1258,6 @@ fn test_x64_emit() {
        "4C09FA",
        "orq     %rdx, %r15, %rdx",
    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::And8,
-            RegMemImm::reg(r15),
-            w_rdx,
-        ),
-        "4420FA",
-        "andb    %dl, %r15b, %dl",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::And8,
-            RegMemImm::reg(rax),
-            w_rsi,
-        ),
-        "4020C6",
-        "andb    %sil, %al, %sil",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::And8,
-            RegMemImm::reg(rax),
-            w_rbx,
-        ),
-        "20C3",
-        "andb    %bl, %al, %bl",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::And8,
-            RegMemImm::mem(Amode::imm_reg(0, rax)),
-            w_rbx,
-        ),
-        "2218",
-        "andb    %bl, 0(%rax), %bl",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Or8,
-            RegMemImm::reg(r15),
-            w_rdx,
-        ),
-        "4408FA",
-        "orb     %dl, %r15b, %dl",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Or8,
-            RegMemImm::reg(rax),
-            w_rsi,
-        ),
-        "4008C6",
-        "orb     %sil, %al, %sil",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Or8,
-            RegMemImm::reg(rax),
-            w_rbx,
-        ),
-        "08C3",
-        "orb     %bl, %al, %bl",
-    ));
-    insns.push((
-        Inst::alu_rmi_r(
-            OperandSize::Size32,
-            AluRmiROpcode::Or8,
-            RegMemImm::mem(Amode::imm_reg(0, rax)),
-            w_rbx,
-        ),
-        "0A18",
-        "orb     %bl, 0(%rax), %bl",
-    ));
    insns.push((
        Inst::alu_rmi_r(
            OperandSize::Size64,
--- a/cranelift/codegen/src/isa/x64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/x64/inst/mod.rs
@@ -729,10 +729,6 @@ impl Inst {
        Inst::JmpKnown { dst }
    }

-    pub(crate) fn jmp_if(cc: CC, taken: MachLabel) -> Inst {
-        Inst::JmpIf { cc, taken }
-    }
-
    pub(crate) fn jmp_cond(cc: CC, taken: MachLabel, not_taken: MachLabel) -> Inst {
        Inst::JmpCond {
            cc,
@@ -892,23 +888,15 @@ impl PrettyPrint for Inst {
            .to_string()
        }

-        fn suffix_lqb(size: OperandSize, is_8: bool) -> String {
-            match (size, is_8) {
-                (_, true) => "b",
-                (OperandSize::Size32, false) => "l",
-                (OperandSize::Size64, false) => "q",
+        fn suffix_lqb(size: OperandSize) -> String {
+            match size {
+                OperandSize::Size32 => "l",
+                OperandSize::Size64 => "q",
                _ => unreachable!(),
            }
            .to_string()
        }

-        fn size_lqb(size: OperandSize, is_8: bool) -> u8 {
-            if is_8 {
-                return 1;
-            }
-            size.to_bytes()
-        }
-
        fn suffix_bwlq(size: OperandSize) -> String {
            match size {
                OperandSize::Size8 => "b".to_string(),
@@ -922,11 +910,10 @@ impl PrettyPrint for Inst {
            Inst::Nop { len } => format!("{} len={}", ljustify("nop".to_string()), len),

            Inst::AluRmiR { size, op, dst, .. } if self.produces_const() => {
-                let dst =
-                    pretty_print_reg(dst.to_reg().to_reg(), size_lqb(*size, op.is_8bit()), allocs);
+                let dst = pretty_print_reg(dst.to_reg().to_reg(), size.to_bytes(), allocs);
                format!(
                    "{} {}, {}, {}",
-                    ljustify2(op.to_string(), suffix_lqb(*size, op.is_8bit())),
+                    ljustify2(op.to_string(), suffix_lqb(*size)),
                    dst,
                    dst,
                    dst
@@ -939,13 +926,13 @@ impl PrettyPrint for Inst {
                src2,
                dst,
            } => {
-                let size_bytes = size_lqb(*size, op.is_8bit());
+                let size_bytes = size.to_bytes();
                let src1 = pretty_print_reg(src1.to_reg(), size_bytes, allocs);
                let dst = pretty_print_reg(dst.to_reg().to_reg(), size_bytes, allocs);
                let src2 = src2.pretty_print(size_bytes, allocs);
                format!(
                    "{} {}, {}, {}",
-                    ljustify2(op.to_string(), suffix_lqb(*size, op.is_8bit())),
+                    ljustify2(op.to_string(), suffix_lqb(*size)),
                    src1,
                    src2,
                    dst
@@ -957,12 +944,12 @@ impl PrettyPrint for Inst {
                src1_dst,
                src2,
            } => {
-                let size_bytes = size_lqb(*size, op.is_8bit());
+                let size_bytes = size.to_bytes();
                let src2 = pretty_print_reg(src2.to_reg(), size_bytes, allocs);
                let src1_dst = src1_dst.pretty_print(size_bytes, allocs);
                format!(
                    "{} {}, {}",
-                    ljustify2(op.to_string(), suffix_lqb(*size, op.is_8bit())),
+                    ljustify2(op.to_string(), suffix_lqb(*size)),
                    src2,
                    src1_dst,
                )
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -2872,17 +2872,49 @@
 (rule (lower_branch (brif cc (ifcmp a b) _ _) (two_targets taken not_taken))
      (side_effect (jmp_cond_icmp (emit_cmp cc a b) taken not_taken)))

-;; Rules for `brz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Rules for `brff` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower_branch (brff cc (ffcmp a b) _ _) (two_targets taken not_taken))
+      (side_effect (jmp_cond_fcmp (emit_fcmp cc a b) taken not_taken)))
+
+;; Rules for `brz` and `brnz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (rule (lower_branch (brz (icmp cc a b) _ _) (two_targets taken not_taken))
      (let ((cmp IcmpCondResult (invert_icmp_cond_result (emit_cmp cc a b))))
        (side_effect (jmp_cond_icmp cmp taken not_taken))))

-;; Rules for `brnz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+(rule (lower_branch (brz (fcmp cc a b) _ _) (two_targets taken not_taken))
+      (let ((cmp FcmpCondResult (emit_fcmp (floatcc_inverse cc) a b)))
+        (side_effect (jmp_cond_fcmp cmp taken not_taken))))
+
+(rule (lower_branch (brz val @ (value_type $I128) _ _) (two_targets taken not_taken))
+      (side_effect (jmp_cond_icmp (cmp_zero_i128 (CC.NZ) val) taken not_taken)))

 (rule (lower_branch (brnz (icmp cc a b) _ _) (two_targets taken not_taken))
      (side_effect (jmp_cond_icmp (emit_cmp cc a b) taken not_taken)))

+(rule (lower_branch (brnz (fcmp cc a b) _ _) (two_targets taken not_taken))
+      (let ((cmp FcmpCondResult (emit_fcmp cc a b)))
+        (side_effect (jmp_cond_fcmp cmp taken not_taken))))
+
+(rule (lower_branch (brnz val @ (value_type $I128) _ _) (two_targets taken not_taken))
+      (side_effect (jmp_cond_icmp (cmp_zero_i128 (CC.Z) val) taken not_taken)))
+
+;; Compare an I128 value to zero, returning a flags result suitable for making a
+;; jump decision. The comparison is implemented as `(hi == 0) && (lo == 0)`,
+;; and the result can be interpreted as follows:
+;; * CC.Z indicates that the value was non-zero, as one or both of the halves of
+;;   the value were non-zero.
+;; * CC.NZ indicates that both halves of the value were 0.
+(decl cmp_zero_i128 (CC ValueRegs) IcmpCondResult)
+(rule (cmp_zero_i128 (cc_nz_or_z cc) val)
+      (let ((lo Gpr (value_regs_get_gpr val 0))
+            (hi Gpr (value_regs_get_gpr val 1))
+            (lo_z Gpr (with_flags_reg (x64_cmp (OperandSize.Size64) (RegMemImm.Imm 0) lo)
+                                      (x64_setcc (CC.Z))))
+            (hi_z Gpr (with_flags_reg (x64_cmp (OperandSize.Size64) (RegMemImm.Imm 0) hi)
+                                      (x64_setcc (CC.Z)))))
+          (icmp_cond_result (x64_test (OperandSize.Size8) lo_z hi_z) cc)))

 ;; Rules for `bricmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -5,7 +5,7 @@ pub(super) mod isle;

 use crate::data_value::DataValue;
 use crate::ir::{
-    condcodes::{CondCode, FloatCC, IntCC},
+    condcodes::{FloatCC, IntCC},
    types, ExternalName, Inst as IRInst, InstructionData, LibCall, Opcode, Type,
 };
 use crate::isa::x64::abi::*;
@@ -478,100 +478,6 @@ fn emit_cmp<C: LowerCtx<I = Inst>>(ctx: &mut C, insn: IRInst, cc: IntCC) -> IntC
    }
 }

-/// A specification for a fcmp emission.
-enum FcmpSpec {
-    /// Normal flow.
-    Normal,
-
-    /// Avoid emitting Equal at all costs by inverting it to NotEqual, and indicate when that
-    /// happens with `InvertedEqualOrConditions`.
-    ///
-    /// This is useful in contexts where it is hard/inefficient to produce a single instruction (or
-    /// sequence of instructions) that check for an "AND" combination of condition codes; see for
-    /// instance lowering of Select.
-    #[allow(dead_code)]
-    InvertEqual,
-}
-
-/// This explains how to interpret the results of an fcmp instruction.
-enum FcmpCondResult {
-    /// The given condition code must be set.
-    Condition(CC),
-
-    /// Both condition codes must be set.
-    AndConditions(CC, CC),
-
-    /// Either of the conditions codes must be set.
-    OrConditions(CC, CC),
-
-    /// The associated spec was set to `FcmpSpec::InvertEqual` and Equal has been inverted. Either
-    /// of the condition codes must be set, and the user must invert meaning of analyzing the
-    /// condition code results. When the spec is set to `FcmpSpec::Normal`, then this case can't be
-    /// reached.
-    InvertedEqualOrConditions(CC, CC),
-}
-
-/// Emits a float comparison instruction.
-///
-/// Note: make sure that there are no instructions modifying the flags between a call to this
-/// function and the use of the flags!
-fn emit_fcmp<C: LowerCtx<I = Inst>>(
-    ctx: &mut C,
-    insn: IRInst,
-    mut cond_code: FloatCC,
-    spec: FcmpSpec,
-) -> FcmpCondResult {
-    let (flip_operands, inverted_equal) = match cond_code {
-        FloatCC::LessThan
-        | FloatCC::LessThanOrEqual
-        | FloatCC::UnorderedOrGreaterThan
-        | FloatCC::UnorderedOrGreaterThanOrEqual => {
-            cond_code = cond_code.reverse();
-            (true, false)
-        }
-        FloatCC::Equal => {
-            let inverted_equal = match spec {
-                FcmpSpec::Normal => false,
-                FcmpSpec::InvertEqual => {
-                    cond_code = FloatCC::NotEqual; // same as .inverse()
-                    true
-                }
-            };
-            (false, inverted_equal)
-        }
-        _ => (false, false),
-    };
-
-    // The only valid CC constructed with `from_floatcc` can be put in the flag
-    // register with a direct float comparison; do this here.
-    let op = match ctx.input_ty(insn, 0) {
-        types::F32 => SseOpcode::Ucomiss,
-        types::F64 => SseOpcode::Ucomisd,
-        _ => panic!("Bad input type to Fcmp"),
-    };
-
-    let inputs = &[InsnInput { insn, input: 0 }, InsnInput { insn, input: 1 }];
-    let (lhs_input, rhs_input) = if flip_operands {
-        (inputs[1], inputs[0])
-    } else {
-        (inputs[0], inputs[1])
-    };
-    let lhs = put_input_in_reg(ctx, lhs_input);
-    let rhs = input_to_reg_mem(ctx, rhs_input);
-    ctx.emit(Inst::xmm_cmp_rm_r(op, rhs, lhs));
-
-    let cond_result = match cond_code {
-        FloatCC::Equal => FcmpCondResult::AndConditions(CC::NP, CC::Z),
-        FloatCC::NotEqual if inverted_equal => {
-            FcmpCondResult::InvertedEqualOrConditions(CC::P, CC::NZ)
-        }
-        FloatCC::NotEqual if !inverted_equal => FcmpCondResult::OrConditions(CC::P, CC::NZ),
-        _ => FcmpCondResult::Condition(CC::from_floatcc(cond_code)),
-    };
-
-    cond_result
-}
-
 fn emit_vm_call<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    flags: &Flags,
@@ -2878,61 +2784,10 @@ impl LowerBackend for X64Backend {

                    if let Some(_icmp) = matches_input(ctx, flag_input, Opcode::Icmp) {
                        implemented_in_isle(ctx)
-                    } else if let Some(fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
-                        let cond_code = ctx.data(fcmp).fp_cond_code().unwrap();
-                        let cond_code = if op0 == Opcode::Brz {
-                            cond_code.inverse()
-                        } else {
-                            cond_code
-                        };
-                        match emit_fcmp(ctx, fcmp, cond_code, FcmpSpec::Normal) {
-                            FcmpCondResult::Condition(cc) => {
-                                ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
-                            }
-                            FcmpCondResult::AndConditions(cc1, cc2) => {
-                                ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
-                                ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
-                            }
-                            FcmpCondResult::OrConditions(cc1, cc2) => {
-                                ctx.emit(Inst::jmp_if(cc1, taken));
-                                ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
-                            }
-                            FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
-                        }
+                    } else if let Some(_fcmp) = matches_input(ctx, flag_input, Opcode::Fcmp) {
+                        implemented_in_isle(ctx)
                    } else if src_ty == types::I128 {
-                        let src = put_input_in_regs(
-                            ctx,
-                            InsnInput {
-                                insn: branches[0],
-                                input: 0,
-                            },
-                        );
-                        let (half_cc, comb_op) = match op0 {
-                            Opcode::Brz => (CC::Z, AluRmiROpcode::And8),
-                            Opcode::Brnz => (CC::NZ, AluRmiROpcode::Or8),
-                            _ => unreachable!(),
-                        };
-                        let tmp1 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                        let tmp2 = ctx.alloc_tmp(types::I64).only_reg().unwrap();
-                        ctx.emit(Inst::cmp_rmi_r(
-                            OperandSize::Size64,
-                            RegMemImm::imm(0),
-                            src.regs()[0],
-                        ));
-                        ctx.emit(Inst::setcc(half_cc, tmp1));
-                        ctx.emit(Inst::cmp_rmi_r(
-                            OperandSize::Size64,
-                            RegMemImm::imm(0),
-                            src.regs()[1],
-                        ));
-                        ctx.emit(Inst::setcc(half_cc, tmp2));
-                        ctx.emit(Inst::alu_rmi_r(
-                            OperandSize::Size32,
-                            comb_op,
-                            RegMemImm::reg(tmp1.to_reg()),
-                            tmp2,
-                        ));
-                        ctx.emit(Inst::jmp_cond(CC::NZ, taken, not_taken));
+                        implemented_in_isle(ctx);
                    } else if is_int_or_ref_ty(src_ty) || is_bool_ty(src_ty) {
                        let src = put_input_in_reg(
                            ctx,
@@ -2968,34 +2823,7 @@ impl LowerBackend for X64Backend {
                    }
                }

-                Opcode::BrIcmp | Opcode::Brif => implemented_in_isle(ctx),
-                Opcode::Brff => {
-                    let flag_input = InsnInput {
-                        insn: branches[0],
-                        input: 0,
-                    };
-
-                    if let Some(ffcmp) = matches_input(ctx, flag_input, Opcode::Ffcmp) {
-                        let cond_code = ctx.data(branches[0]).fp_cond_code().unwrap();
-                        match emit_fcmp(ctx, ffcmp, cond_code, FcmpSpec::Normal) {
-                            FcmpCondResult::Condition(cc) => {
-                                ctx.emit(Inst::jmp_cond(cc, taken, not_taken));
-                            }
-                            FcmpCondResult::AndConditions(cc1, cc2) => {
-                                ctx.emit(Inst::jmp_if(cc1.invert(), not_taken));
-                                ctx.emit(Inst::jmp_cond(cc2.invert(), not_taken, taken));
-                            }
-                            FcmpCondResult::OrConditions(cc1, cc2) => {
-                                ctx.emit(Inst::jmp_if(cc1, taken));
-                                ctx.emit(Inst::jmp_cond(cc2, taken, not_taken));
-                            }
-                            FcmpCondResult::InvertedEqualOrConditions(_, _) => unreachable!(),
-                        }
-                    } else {
-                        // Should be disallowed by flags checks in verifier.
-                        unimplemented!("Brff with input not from ffcmp");
-                    }
-                }
+                Opcode::BrIcmp | Opcode::Brif | Opcode::Brff => implemented_in_isle(ctx),

                _ => panic!("unexpected branch opcode: {:?}", op0),
            }
--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -14,7 +14,7 @@ use crate::ir::LibCall;
 use crate::isa::x64::lower::emit_vm_call;
 use crate::{
    ir::{
-        condcodes::{FloatCC, IntCC},
+        condcodes::{CondCode, FloatCC, IntCC},
        immediates::*,
        types::*,
        Inst, InstructionData, MemFlags, Opcode, TrapCode, Value, ValueList,
@@ -590,6 +590,20 @@ where
        cc.invert()
    }

+    #[inline]
+    fn cc_nz_or_z(&mut self, cc: &CC) -> Option<CC> {
+        match cc {
+            CC::Z => Some(*cc),
+            CC::NZ => Some(*cc),
+            _ => None,
+        }
+    }
+
+    #[inline]
+    fn floatcc_inverse(&mut self, cc: &FloatCC) -> FloatCC {
+        cc.inverse()
+    }
+
    #[inline]
    fn sum_extend_fits_in_32_bits(
        &mut self,
--- a/cranelift/codegen/src/prelude.isle
+++ b/cranelift/codegen/src/prelude.isle
@@ -553,7 +553,10 @@
 (type SideEffectNoResult (enum
                          (Inst (inst MInst))
                          (Inst2 (inst1 MInst)
-                                 (inst2 MInst))))
+                                 (inst2 MInst))
+                          (Inst3 (inst1 MInst)
+                                 (inst2 MInst)
+                                 (inst3 MInst))))

 ;; Create an empty `InstOutput`, but do emit the given side-effectful
 ;; instruction.
@@ -565,10 +568,19 @@
      (let ((_ Unit (emit inst1))
            (_ Unit (emit inst2)))
        (output_none)))
+(rule (side_effect (SideEffectNoResult.Inst3 inst1 inst2 inst3))
+      (let ((_ Unit (emit inst1))
+            (_ Unit (emit inst2))
+            (_ Unit (emit inst3)))
+        (output_none)))

 (decl side_effect_concat (SideEffectNoResult SideEffectNoResult) SideEffectNoResult)
 (rule (side_effect_concat (SideEffectNoResult.Inst inst1) (SideEffectNoResult.Inst inst2))
      (SideEffectNoResult.Inst2 inst1 inst2))
+(rule (side_effect_concat (SideEffectNoResult.Inst inst1) (SideEffectNoResult.Inst2 inst2 inst3))
+      (SideEffectNoResult.Inst3 inst1 inst2 inst3))
+(rule (side_effect_concat (SideEffectNoResult.Inst2 inst1 inst2) (SideEffectNoResult.Inst inst3))
+      (SideEffectNoResult.Inst3 inst1 inst2 inst3))

 ;;;; Helpers for Working with Flags ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

@@ -595,6 +607,7 @@
 ;; ProducesFlags. See `with_flags` below for more.
 (type ConsumesFlags (enum
                     (ConsumesFlagsSideEffect (inst MInst))
+                     (ConsumesFlagsSideEffect2 (inst1 MInst) (inst2 MInst))
                     (ConsumesFlagsReturnsResultWithProducer (inst MInst) (result Reg))
                     (ConsumesFlagsReturnsReg (inst MInst) (result Reg))
                     (ConsumesFlagsTwiceReturnsValueRegs (inst1 MInst)
@@ -630,6 +643,10 @@
       inst1
       inst2
       (value_regs reg1 reg2)))
+(rule (consumes_flags_concat
+        (ConsumesFlags.ConsumesFlagsSideEffect inst1)
+        (ConsumesFlags.ConsumesFlagsSideEffect inst2))
+      (ConsumesFlags.ConsumesFlagsSideEffect2 inst1 inst2))

 ;; Combine flags-producing and -consuming instructions together, ensuring that
 ;; they are emitted back-to-back and no other instructions can be emitted
@@ -707,11 +724,21 @@
        (ConsumesFlags.ConsumesFlagsSideEffect c))
      (SideEffectNoResult.Inst c))

+(rule (with_flags_side_effect
+        (ProducesFlags.AlreadyExistingFlags)
+        (ConsumesFlags.ConsumesFlagsSideEffect2 c1 c2))
+      (SideEffectNoResult.Inst2 c1 c2))
+
 (rule (with_flags_side_effect
        (ProducesFlags.ProducesFlagsSideEffect p)
        (ConsumesFlags.ConsumesFlagsSideEffect c))
      (SideEffectNoResult.Inst2 p c))

+(rule (with_flags_side_effect
+        (ProducesFlags.ProducesFlagsSideEffect p)
+        (ConsumesFlags.ConsumesFlagsSideEffect2 c1 c2))
+      (SideEffectNoResult.Inst3 p c1 c2))
+
 ;;;; Helpers for Working with TrapCode ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (decl trap_code_division_by_zero () TrapCode)
--- a/cranelift/filetests/filetests/isa/x64/branches.clif
+++ b/cranelift/filetests/filetests/isa/x64/branches.clif
@@ -126,3 +126,62 @@ block2:
 ;   popq    %rbp
 ;   ret

+function %f4(f32, f32) -> b1 {
+block0(v0: f32, v1: f32):
+  v2 = fcmp eq v0, v1
+  brz v2, block1
+  jump block2
+block1:
+  v3 = bconst.b1 true
+  return v3
+block2:
+  v4 = bconst.b1 false
+  return v4
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   ucomiss %xmm1, %xmm0
+;   jp      label1
+;   jnz     label1; j label2
+; block1:
+;   movl    $1, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; block2:
+;   xorl    %eax, %eax, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+
+function %f5(f32, f32) -> b1 {
+block0(v0: f32, v1: f32):
+  v2 = fcmp ne v0, v1
+  brz v2, block1
+  jump block2
+block1:
+  v3 = bconst.b1 true
+  return v3
+block2:
+  v4 = bconst.b1 false
+  return v4
+}
+
+;   pushq   %rbp
+;   movq    %rsp, %rbp
+; block0:
+;   ucomiss %xmm1, %xmm0
+;   jp      label2
+;   jnz     label2; j label1
+; block1:
+;   movl    $1, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
+; block2:
+;   xorl    %eax, %eax, %eax
+;   movq    %rbp, %rsp
+;   popq    %rbp
+;   ret
--- a/cranelift/filetests/filetests/isa/x64/i128.clif
+++ b/cranelift/filetests/filetests/isa/x64/i128.clif
@@ -320,7 +320,7 @@ block2:
 ;   setz    %r11b
 ;   cmpq    $0, %rsi
 ;   setz    %al
-;   andb    %al, %r11b, %al
+;   testb   %r11b, %al
 ;   jnz     label1; j label2
 ; block1:
 ;   movl    $1, %eax
@@ -351,11 +351,11 @@ block2:
 ;   movq    %rsp, %rbp
 ; block0:
 ;   cmpq    $0, %rdi
-;   setnz   %r11b
+;   setz    %r11b
 ;   cmpq    $0, %rsi
-;   setnz   %al
-;   orb     %al, %r11b, %al
-;   jnz     label1; j label2
+;   setz    %al
+;   testb   %r11b, %al
+;   jz      label1; j label2
 ; block1:
 ;   movl    $1, %eax
 ;   movq    %rbp, %rsp
--- a/cranelift/filetests/filetests/runtests/fcmp.clif
+++ b/cranelift/filetests/filetests/runtests/fcmp.clif
@@ -0,0 +1,62 @@
+test run
+target aarch64
+target s390x
+target x86_64
+
+function %fcmp_eq(f64, f64) -> b1 {
+block0(v0: f64, v1: f64):
+    v2 = fcmp eq v0, v1
+    return v2
+}
+
+; run: %fcmp_eq(0x1.0, 0x1.0) == true
+; run: %fcmp_eq(0x1.0, 0x0.0) == false
+
+function %fcmp_ne(f64, f64) -> b1 {
+block0(v0: f64, v1: f64):
+    v2 = fcmp ne v0, v1
+    return v2
+}
+
+; run: %fcmp_ne(0x1.0, 0x1.0) == false
+; run: %fcmp_ne(0x1.0, 0x0.0) == true
+
+function %fcmp_lt(f64, f64) -> b1 {
+block0(v0: f64, v1: f64):
+    v2 = fcmp lt v0, v1
+    return v2
+}
+
+; run: %fcmp_lt(0x1.0, 0x1.0) == false
+; run: %fcmp_lt(0x1.0, 0x0.0) == false
+; run: %fcmp_lt(0x1.0, 0x2.3) == true
+
+function %fcmp_le(f64, f64) -> b1 {
+block0(v0: f64, v1: f64):
+    v2 = fcmp le v0, v1
+    return v2
+}
+
+; run: %fcmp_le(0x1.0, 0x1.0) == true
+; run: %fcmp_le(0x1.0, 0x0.0) == false
+; run: %fcmp_le(0x1.0, 0x2.3) == true
+
+function %fcmp_gt(f64, f64) -> b1 {
+block0(v0: f64, v1: f64):
+    v2 = fcmp gt v0, v1
+    return v2
+}
+
+; run: %fcmp_gt(0x1.0, 0x1.0) == false
+; run: %fcmp_gt(0x1.0, 0x0.0) == true
+; run: %fcmp_gt(0x1.0, 0x2.3) == false
+
+function %fcmp_ge(f64, f64) -> b1 {
+block0(v0: f64, v1: f64):
+    v2 = fcmp ge v0, v1
+    return v2
+}
+
+; run: %fcmp_ge(0x1.0, 0x1.0) == true
+; run: %fcmp_ge(0x1.0, 0x0.0) == true
+; run: %fcmp_ge(0x1.0, 0x2.3) == false