x64: port icmp to ISLE (#3886)

* x64: port GPR-held `icmp` to ISLE
* x64: port equality `icmp` for i128 type
* x64: port `icmp` for vector types
* x64: rename from_intcc to intcc_to_cc
2022-03-18 11:22:09 -07:00
parent 8cfb552090
commit e92cbfb283
8 changed files with 1145 additions and 549 deletions
--- a/cranelift/codegen/src/isa/x64/inst.isle
+++ b/cranelift/codegen/src/isa/x64/inst.isle
@@ -784,6 +784,7 @@
 (decl put_masked_in_imm8_gpr (Value Type) Imm8Gpr)
 (extern constructor put_masked_in_imm8_gpr put_masked_in_imm8_gpr)

+;; Condition codes
 (type CC extern
      (enum O
            NO
@@ -801,6 +802,8 @@
            NLE
            P
            NP))
+;; Convert a CLIF integer condition code (`IntCC`) into an x64 condition code
+;; (`CC`). This is an external constructor: it is implemented outside of ISLE.
+(decl intcc_to_cc (IntCC) CC)
+(extern constructor intcc_to_cc intcc_to_cc)

 (type Avx512Opcode extern
      (enum Vcvtudq2ps
@@ -1362,6 +1365,16 @@
                 src1
                 src2))

+;; Helper for emitting an `and` instruction as a flags producer, intended to be
+;; paired with a flags consumer via `with_flags`. The AND result is written to
+;; a fresh temporary register; only the instruction itself is returned.
+(decl x64_and_with_flags_paired (Type Gpr GprMemImm) ProducesFlags)
+(rule (x64_and_with_flags_paired ty lhs rhs)
+      (let ((scratch WritableGpr (temp_writable_gpr)))
+            (ProducesFlags.ProducesFlagsSideEffect
+                  (MInst.AluRmiR (operand_size_of_type_32_64 ty)
+                        (AluRmiROpcode.And)
+                        lhs
+                        rhs
+                        scratch))))
+
 ;; Helper for emitting `or` instructions.
 (decl or (Type Gpr GprMemImm) Gpr)
 (rule (or ty src1 src2)
@@ -1992,65 +2005,57 @@
 (rule (movlhps src1 src2)
      (xmm_rm_r $I8X16 (SseOpcode.Movlhps) src1 src2))

-;; Helper for creating `pmaxsb` instructions.
+;; Helpers for creating `pmaxs*` (packed signed max) instructions, by lane type.
+(decl pmaxs (Type Xmm XmmMem) Xmm)
+(rule (pmaxs $I8X16 x y) (pmaxsb x y))
+(rule (pmaxs $I16X8 x y) (pmaxsw x y))
+(rule (pmaxs $I32X4 x y) (pmaxsd x y))
+;; No $I64X2 version (PMAXSQ) in SSE4.1.
 (decl pmaxsb (Xmm XmmMem) Xmm)
-(rule (pmaxsb src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pmaxsb) src1 src2))
-
-;; Helper for creating `pmaxsw` instructions.
+(rule (pmaxsb src1 src2) (xmm_rm_r $I8X16 (SseOpcode.Pmaxsb) src1 src2))
 (decl pmaxsw (Xmm XmmMem) Xmm)
-(rule (pmaxsw src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pmaxsw) src1 src2))
-
-;; Helper for creating `pmaxsd` instructions.
+(rule (pmaxsw src1 src2) (xmm_rm_r $I16X8 (SseOpcode.Pmaxsw) src1 src2))
 (decl pmaxsd (Xmm XmmMem) Xmm)
-(rule (pmaxsd src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pmaxsd) src1 src2))
+(rule (pmaxsd src1 src2) (xmm_rm_r $I32X4 (SseOpcode.Pmaxsd) src1 src2))

-;; Helper for creating `pminsb` instructions.
+;; Helpers for creating `pmins*` (packed signed min) instructions, by lane type.
+(decl pmins (Type Xmm XmmMem) Xmm)
+(rule (pmins $I8X16 src1 src2) (pminsb src1 src2))
+(rule (pmins $I16X8 src1 src2) (pminsw src1 src2))
+(rule (pmins $I32X4 src1 src2) (pminsd src1 src2))
+;; There is no rule for $I64X2: SSE4.1 has no PMINSQ.
 (decl pminsb (Xmm XmmMem) Xmm)
-(rule (pminsb src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pminsb) src1 src2))
-
-;; Helper for creating `pminsw` instructions.
+(rule (pminsb lhs rhs) (xmm_rm_r $I8X16 (SseOpcode.Pminsb) lhs rhs))
 (decl pminsw (Xmm XmmMem) Xmm)
-(rule (pminsw src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pminsw) src1 src2))
-
-;; Helper for creating `pminsd` instructions.
+(rule (pminsw lhs rhs) (xmm_rm_r $I16X8 (SseOpcode.Pminsw) lhs rhs))
 (decl pminsd (Xmm XmmMem) Xmm)
-(rule (pminsd src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pminsd) src1 src2))
+(rule (pminsd lhs rhs) (xmm_rm_r $I32X4 (SseOpcode.Pminsd) lhs rhs))

-;; Helper for creating `pmaxub` instructions.
+;; Helpers for creating `pmaxu*` (packed unsigned max) instructions, by lane type.
+(decl pmaxu (Type Xmm XmmMem) Xmm)
+(rule (pmaxu $I8X16 x y) (pmaxub x y))
+(rule (pmaxu $I16X8 x y) (pmaxuw x y))
+(rule (pmaxu $I32X4 x y) (pmaxud x y))
+;; No $I64X2 version (PMAXUQ) in SSE4.1.
 (decl pmaxub (Xmm XmmMem) Xmm)
-(rule (pmaxub src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pmaxub) src1 src2))
-
-;; Helper for creating `pmaxuw` instructions.
+(rule (pmaxub src1 src2) (xmm_rm_r $I8X16 (SseOpcode.Pmaxub) src1 src2))
 (decl pmaxuw (Xmm XmmMem) Xmm)
-(rule (pmaxuw src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pmaxuw) src1 src2))
-
-;; Helper for creating `pmaxud` instructions.
+(rule (pmaxuw src1 src2) (xmm_rm_r $I16X8 (SseOpcode.Pmaxuw) src1 src2))
 (decl pmaxud (Xmm XmmMem) Xmm)
-(rule (pmaxud src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pmaxud) src1 src2))
+(rule (pmaxud src1 src2) (xmm_rm_r $I32X4 (SseOpcode.Pmaxud) src1 src2))

-;; Helper for creating `pminub` instructions.
+;; Helpers for creating `pminu*` (packed unsigned min) instructions, by lane type.
+(decl pminu (Type Xmm XmmMem) Xmm)
+(rule (pminu $I8X16 x y) (pminub x y))
+(rule (pminu $I16X8 x y) (pminuw x y))
+(rule (pminu $I32X4 x y) (pminud x y))
+;; No $I64X2 version (PMINUQ) in SSE4.1.
 (decl pminub (Xmm XmmMem) Xmm)
-(rule (pminub src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pminub) src1 src2))
-
-;; Helper for creating `pminuw` instructions.
+(rule (pminub src1 src2) (xmm_rm_r $I8X16 (SseOpcode.Pminub) src1 src2))
 (decl pminuw (Xmm XmmMem) Xmm)
-(rule (pminuw src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pminuw) src1 src2))
-
-;; Helper for creating `pminud` instructions.
+(rule (pminuw src1 src2) (xmm_rm_r $I16X8 (SseOpcode.Pminuw) src1 src2))
 (decl pminud (Xmm XmmMem) Xmm)
-(rule (pminud src1 src2)
-      (xmm_rm_r $I8X16 (SseOpcode.Pminud) src1 src2))
+(rule (pminud src1 src2) (xmm_rm_r $I32X4 (SseOpcode.Pminud) src1 src2))

 ;; Helper for creating `punpcklbw` instructions.
 (decl punpcklbw (Xmm XmmMem) Xmm)
@@ -2498,6 +2503,38 @@
            (_ Unit (emit (MInst.XmmRmR (SseOpcode.Maxpd) x y dst))))
        dst))

+;; Helpers for creating `pcmpeq*` (packed compare-equal) instructions, by type.
+(decl pcmpeq (Type Xmm XmmMem) Xmm)
+(rule (pcmpeq $I8X16 src1 src2) (pcmpeqb src1 src2))
+(rule (pcmpeq $I16X8 src1 src2) (pcmpeqw src1 src2))
+(rule (pcmpeq $I32X4 src1 src2) (pcmpeqd src1 src2))
+(rule (pcmpeq $I64X2 src1 src2) (pcmpeqq src1 src2))
+
+(decl pcmpeqb (Xmm XmmMem) Xmm)
+(rule (pcmpeqb src1 src2) (xmm_rm_r $I8X16 (SseOpcode.Pcmpeqb) src1 src2))
+(decl pcmpeqw (Xmm XmmMem) Xmm)
+(rule (pcmpeqw src1 src2) (xmm_rm_r $I16X8 (SseOpcode.Pcmpeqw) src1 src2))
+(decl pcmpeqd (Xmm XmmMem) Xmm)
+(rule (pcmpeqd src1 src2) (xmm_rm_r $I32X4 (SseOpcode.Pcmpeqd) src1 src2))
+(decl pcmpeqq (Xmm XmmMem) Xmm)
+(rule (pcmpeqq src1 src2) (xmm_rm_r $I64X2 (SseOpcode.Pcmpeqq) src1 src2))
+
+;; Helpers for creating `pcmpgt*` (packed signed greater-than) instructions.
+(decl pcmpgt (Type Xmm XmmMem) Xmm)
+(rule (pcmpgt $I8X16 src1 src2) (pcmpgtb src1 src2))
+(rule (pcmpgt $I16X8 src1 src2) (pcmpgtw src1 src2))
+(rule (pcmpgt $I32X4 src1 src2) (pcmpgtd src1 src2))
+(rule (pcmpgt $I64X2 src1 src2) (pcmpgtq src1 src2))
+
+(decl pcmpgtb (Xmm XmmMem) Xmm)
+(rule (pcmpgtb src1 src2) (xmm_rm_r $I8X16 (SseOpcode.Pcmpgtb) src1 src2))
+(decl pcmpgtw (Xmm XmmMem) Xmm)
+(rule (pcmpgtw src1 src2) (xmm_rm_r $I16X8 (SseOpcode.Pcmpgtw) src1 src2))
+(decl pcmpgtd (Xmm XmmMem) Xmm)
+(rule (pcmpgtd src1 src2) (xmm_rm_r $I32X4 (SseOpcode.Pcmpgtd) src1 src2))
+(decl pcmpgtq (Xmm XmmMem) Xmm)
+(rule (pcmpgtq src1 src2) (xmm_rm_r $I64X2 (SseOpcode.Pcmpgtq) src1 src2))
+
 ;;;; Automatic conversions ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 (convert Gpr InstOutput output_gpr)
@@ -2547,6 +2584,8 @@
 (convert SyntheticAmode GprMem synthetic_amode_to_gpr_mem)
 (convert SyntheticAmode XmmMem synthetic_amode_to_xmm_mem)

+(convert IntCC CC intcc_to_cc)
+
 (decl reg_to_xmm_mem (Reg) XmmMem)
 (rule (reg_to_xmm_mem r)
      (xmm_to_xmm_mem (xmm_new r)))
--- a/cranelift/codegen/src/isa/x64/lower.isle
+++ b/cranelift/codegen/src/isa/x64/lower.isle
@@ -1440,6 +1440,107 @@
 (rule (lower (resumable_trap code))
      (safepoint (ud2 code)))

+;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; GPR-held values (types matched by `fits_in_64`) only need `CMP + SETCC`.
+;; Only `a`'s type is matched; we rely on Cranelift's verification that `b` has
+;; the same type. The operands reach `cmp` flipped, `b` before `a` (TODO).
+(rule (lower (icmp cc a @ (value_type (fits_in_64 ty)) b))
+      (let ((operand_size OperandSize (raw_operand_size_of_type ty)))
+           (with_flags (cmp operand_size b a) (setcc cc))))
+
+;; XMM-held (128-bit vector) values are lowered to `PCMP*` instructions,
+;; sometimes more than one. The output differs from the GPR case: there the
+;; result is simply 0 or 1 (upper bits unset), whereas here each lane of the
+;; result is filled with all 1s or all 0s according to the comparison in that
+;; lane.
+(rule (lower (icmp (IntCC.Equal) a @ (value_type (vec128 ty)) b))
+      (pcmpeq ty a b))
+;; Not-equals is an equality comparison (PCMPEQ*) whose bits are then inverted
+;; (PXOR with all 1s).
+(rule (lower (icmp (IntCC.NotEqual) a @ (value_type (vec128 ty)) b))
+      (let ((equal_lanes Xmm (pcmpeq ty a b))
+            (ones Xmm (vector_all_ones ty)))
+           (pxor equal_lanes ones)))
+;; Signed strict comparisons lower to a single PCMPGT*. The unsigned ones have
+;; no such instruction: they take the unsigned max/min (PMAXU*/PMINU*), test it
+;; for equality with `b` (PCMPEQ*), and negate the result (PXOR with all 1s).
+(rule (lower (icmp (IntCC.SignedGreaterThan) a @ (value_type (vec128 ty)) b))
+      (pcmpgt ty a b))
+(rule (lower (icmp (IntCC.SignedLessThan) a @ (value_type (vec128 ty)) b))
+      (pcmpgt ty b a))
+(rule (lower (icmp (IntCC.UnsignedGreaterThan) a @ (value_type (vec128 ty)) b))
+      (let ((larger Xmm (pmaxu ty a b))
+            (not_greater Xmm (pcmpeq ty larger b))
+            (ones Xmm (vector_all_ones ty)))
+           (pxor not_greater ones)))
+(rule (lower (icmp (IntCC.UnsignedLessThan) a @ (value_type (vec128 ty)) b))
+      (let ((smaller Xmm (pminu ty a b))
+            (not_less Xmm (pcmpeq ty smaller b))
+            (ones Xmm (vector_all_ones ty)))
+           (pxor not_less ones)))
+;; For the signed and unsigned *-or-equals comparisons, we take the maximum
+;; (for >=) or minimum (for <=) of the operands (PMAX*/PMIN*) and test whether
+;; `a` equals it (PCMPEQ*). This lowering has no 64x2 version (see below).
+(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type (vec128 ty)) b))
+      (let ((larger Xmm (pmaxs ty a b)))
+           (pcmpeq ty a larger)))
+(rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type (vec128 ty)) b))
+      (let ((smaller Xmm (pmins ty a b)))
+           (pcmpeq ty a smaller)))
+(rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type (vec128 ty)) b))
+      (let ((larger Xmm (pmaxu ty a b)))
+           (pcmpeq ty a larger)))
+(rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type (vec128 ty)) b))
+      (let ((smaller Xmm (pminu ty a b)))
+           (pcmpeq ty a smaller)))
+;; PMIN[S|U]Q is only available in AVX512VL/F, so for 64x2 lanes we instead
+;; emit the opposite strict comparison (PCMPGT*: `b > a` for `a >= b`, `a > b`
+;; for `a <= b`) and negate it (PXOR with all 1s)--one more instruction.
+(rule (lower (icmp (IntCC.SignedGreaterThanOrEqual) a @ (value_type $I64X2) b))
+      (let ((strictly_less Xmm (pcmpgt $I64X2 b a))
+            (ones Xmm (vector_all_ones $I64X2)))
+           (pxor strictly_less ones)))
+(rule (lower (icmp (IntCC.SignedLessThanOrEqual) a @ (value_type $I64X2) b))
+      (let ((strictly_greater Xmm (pcmpgt $I64X2 a b))
+            (ones Xmm (vector_all_ones $I64X2)))
+           (pxor strictly_greater ones)))
+;; TODO: not used by WebAssembly translation
+;; (rule (lower (icmp (IntCC.UnsignedGreaterThanOrEqual) a @ (value_type $I64X2) b))
+;; TODO: not used by WebAssembly translation
+;; (rule (lower (icmp (IntCC.UnsignedLessThanOrEqual) a @ (value_type $I64X2) b))
+
+;; For I128 values (held in two GPRs), the instruction sequence depends on the
+;; kind of condition that is tested.
+(rule (lower (icmp (IntCC.Equal) a @ (value_type $I128) b))
+      (let ((lhs_lo Gpr (value_regs_get_gpr a 0))
+            (lhs_hi Gpr (value_regs_get_gpr a 1))
+            (rhs_lo Gpr (value_regs_get_gpr b 0))
+            (rhs_hi Gpr (value_regs_get_gpr b 1))
+            (lo_eq Reg (with_flags_reg (cmp (OperandSize.Size64) rhs_lo lhs_lo) (setcc (CC.Z))))
+            (hi_eq Reg (with_flags_reg (cmp (OperandSize.Size64) rhs_hi lhs_hi) (setcc (CC.Z))))
+            ;; `SETcc` guarantees that the lowest 8 bits of `lo_eq` and `hi_eq`
+            ;; hold either 0 or 1; the upper bits may be unchanged, so the
+            ;; final test must compare against 1. First combine the halves:
+            ;; bit 0 of the `AND` is 1 only if both halves compared equal.
+            (both_eq Reg (x64_and $I64 lo_eq hi_eq)))
+           ;; Masking with 1 relies on the flag-setting behavior of `AND`: if
+           ;; the result is zero, `ZF` is set, which means at least one of the
+           ;; halves was not equal. The values are equal when `ZF` is clear,
+           ;; therefore we `SETcc` with `NZ`.
+           (with_flags (x64_and_with_flags_paired $I64 both_eq (RegMemImm.Imm 1)) (setcc (CC.NZ)))))
+
+(rule (lower (icmp (IntCC.NotEqual) a @ (value_type $I128) b))
+      (let ((lhs_lo Gpr (value_regs_get_gpr a 0))
+            (lhs_hi Gpr (value_regs_get_gpr a 1))
+            (rhs_lo Gpr (value_regs_get_gpr b 0))
+            (rhs_hi Gpr (value_regs_get_gpr b 1))
+            (lo_ne Reg (with_flags_reg (cmp (OperandSize.Size64) rhs_lo lhs_lo) (setcc (CC.NZ))))
+            (hi_ne Reg (with_flags_reg (cmp (OperandSize.Size64) rhs_hi lhs_hi) (setcc (CC.NZ))))
+            ;; Like `IntCC.Equal`, but `OR`: either half differing suffices.
+            (either_ne Reg (or $I64 lo_ne hi_ne)))
+           (with_flags (x64_and_with_flags_paired $I64 either_ne (RegMemImm.Imm 1)) (setcc (CC.NZ)))))
+
 ;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

 ;; CLIF `select` instructions receive a testable argument (i.e. boolean or
@@ -1715,8 +1816,8 @@
            (mul Gpr (mul $I32 masked4 (RegMemImm.Imm 0x01010101)))
            (final Gpr (shr $I32 mul (Imm8Reg.Imm8 24))))
        final))
-                       
-            
+
+
 (rule 1 (lower (has_type (and
                          $I8X16
                          (avx512vl_enabled)
@@ -1725,7 +1826,7 @@
      (vpopcntb src))


-      
+
 ;; For SSE 4.2 we use Mula's algorithm (https://arxiv.org/pdf/1611.07612.pdf):
 ;;
 ;; __m128i count_bytes ( __m128i v) {
@@ -1807,7 +1908,7 @@
                           (shl ty lo4 (Imm8Reg.Imm8 4))
                           hi4)))
        swap4))
-                       
+
 (decl do_bitrev16 (Type Gpr) Gpr)
 (rule (do_bitrev16 ty src)
      (let ((src_ Gpr (do_bitrev8 ty src))
@@ -1819,7 +1920,7 @@
                           (shl ty lo8 (Imm8Reg.Imm8 8))
                           hi8)))
        swap8))
-      
+
 (decl do_bitrev32 (Type Gpr) Gpr)
 (rule (do_bitrev32 ty src)
      (let ((src_ Gpr (do_bitrev16 ty src))
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -920,145 +920,12 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
            let condcode = ctx.data(insn).cond_code().unwrap();
            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
            let ty = ctx.input_ty(insn, 0);
-            if !ty.is_vector() {
+            if ty == types::I128 && condcode != IntCC::Equal && condcode != IntCC::NotEqual {
                let condcode = emit_cmp(ctx, insn, condcode);
                let cc = CC::from_intcc(condcode);
                ctx.emit(Inst::setcc(cc, dst));
            } else {
-                assert_eq!(ty.bits(), 128);
-                let eq = |ty| match ty {
-                    types::I8X16 => SseOpcode::Pcmpeqb,
-                    types::I16X8 => SseOpcode::Pcmpeqw,
-                    types::I32X4 => SseOpcode::Pcmpeqd,
-                    types::I64X2 => SseOpcode::Pcmpeqq,
-                    _ => panic!(
-                        "Unable to find an instruction for {} for type: {}",
-                        condcode, ty
-                    ),
-                };
-                let gt = |ty| match ty {
-                    types::I8X16 => SseOpcode::Pcmpgtb,
-                    types::I16X8 => SseOpcode::Pcmpgtw,
-                    types::I32X4 => SseOpcode::Pcmpgtd,
-                    types::I64X2 => SseOpcode::Pcmpgtq,
-                    _ => panic!(
-                        "Unable to find an instruction for {} for type: {}",
-                        condcode, ty
-                    ),
-                };
-                let maxu = |ty| match ty {
-                    types::I8X16 => SseOpcode::Pmaxub,
-                    types::I16X8 => SseOpcode::Pmaxuw,
-                    types::I32X4 => SseOpcode::Pmaxud,
-                    _ => panic!(
-                        "Unable to find an instruction for {} for type: {}",
-                        condcode, ty
-                    ),
-                };
-                let mins = |ty| match ty {
-                    types::I8X16 => SseOpcode::Pminsb,
-                    types::I16X8 => SseOpcode::Pminsw,
-                    types::I32X4 => SseOpcode::Pminsd,
-                    _ => panic!(
-                        "Unable to find an instruction for {} for type: {}",
-                        condcode, ty
-                    ),
-                };
-                let minu = |ty| match ty {
-                    types::I8X16 => SseOpcode::Pminub,
-                    types::I16X8 => SseOpcode::Pminuw,
-                    types::I32X4 => SseOpcode::Pminud,
-                    _ => panic!(
-                        "Unable to find an instruction for {} for type: {}",
-                        condcode, ty
-                    ),
-                };
-
-                // Here we decide which operand to use as the read/write `dst` (ModRM reg field) and
-                // which to use as the read `input` (ModRM r/m field). In the normal case we use
-                // Cranelift's first operand, the `lhs`, as `dst` but we flip the operands for the
-                // less-than cases so that we can reuse the greater-than implementation.
-                //
-                // In a surprising twist, the operands for i64x2 `gte`/`sle` must also be flipped
-                // from the normal order because of the special-case lowering for these instructions
-                // (i.e. we use PCMPGTQ with flipped operands and negate the result).
-                let input = match condcode {
-                    IntCC::SignedLessThanOrEqual if ty == types::I64X2 => {
-                        let lhs = put_input_in_reg(ctx, inputs[0]);
-                        let rhs = input_to_reg_mem(ctx, inputs[1]);
-                        ctx.emit(Inst::gen_move(dst, lhs, ty));
-                        rhs
-                    }
-                    IntCC::SignedGreaterThanOrEqual if ty == types::I64X2 => {
-                        let lhs = input_to_reg_mem(ctx, inputs[0]);
-                        let rhs = put_input_in_reg(ctx, inputs[1]);
-                        ctx.emit(Inst::gen_move(dst, rhs, ty));
-                        lhs
-                    }
-                    IntCC::SignedLessThan
-                    | IntCC::SignedLessThanOrEqual
-                    | IntCC::UnsignedLessThan
-                    | IntCC::UnsignedLessThanOrEqual => {
-                        let lhs = input_to_reg_mem(ctx, inputs[0]);
-                        let rhs = put_input_in_reg(ctx, inputs[1]);
-                        ctx.emit(Inst::gen_move(dst, rhs, ty));
-                        lhs
-                    }
-                    _ => {
-                        let lhs = put_input_in_reg(ctx, inputs[0]);
-                        let rhs = input_to_reg_mem(ctx, inputs[1]);
-                        ctx.emit(Inst::gen_move(dst, lhs, ty));
-                        rhs
-                    }
-                };
-
-                match condcode {
-                    IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)),
-                    IntCC::NotEqual => {
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
-                        // Emit all 1s into the `tmp` register.
-                        let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
-                        // Invert the result of the `PCMPEQ*`.
-                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
-                    }
-                    IntCC::SignedGreaterThan | IntCC::SignedLessThan => {
-                        ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst))
-                    }
-                    IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual
-                        if ty != types::I64X2 =>
-                    {
-                        ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst));
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
-                    }
-                    IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual
-                        if ty == types::I64X2 =>
-                    {
-                        // The PMINS* instruction is only available in AVX512VL/F so we must instead
-                        // compare with flipped operands and negate the result (emitting one more
-                        // instruction).
-                        ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst));
-                        // Emit all 1s into the `tmp` register.
-                        let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
-                        // Invert the result of the `PCMPGT*`.
-                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
-                    }
-                    IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => {
-                        ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst));
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst));
-                        // Emit all 1s into the `tmp` register.
-                        let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp));
-                        // Invert the result of the `PCMPEQ*`.
-                        ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst));
-                    }
-                    IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => {
-                        ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst));
-                        ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst))
-                    }
-                    _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode),
-                }
+                implemented_in_isle(ctx);
            }
        }

--- a/cranelift/codegen/src/isa/x64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle.rs
@@ -9,8 +9,10 @@ use regalloc::Writable;
 use super::{is_int_or_ref_ty, is_mergeable_load, lower_to_amode, Reg};
 use crate::{
    ir::{
-        condcodes::FloatCC, immediates::*, types::*, Inst, InstructionData, Opcode, TrapCode,
-        Value, ValueLabel, ValueList,
+        condcodes::{FloatCC, IntCC},
+        immediates::*,
+        types::*,
+        Inst, InstructionData, Opcode, TrapCode, Value, ValueLabel, ValueList,
    },
    isa::{
        settings::Flags,
@@ -512,6 +514,11 @@ where
            None
        }
    }
+
+    /// Converts a CLIF integer condition code (`IntCC`) into an x64 condition
+    /// code (`CC`) by delegating to `CC::from_intcc`.
+    #[inline]
+    fn intcc_to_cc(&mut self, intcc: &IntCC) -> CC {
+        CC::from_intcc(*intcc)
+    }
 }

 // Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we
--- a/cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest
+++ b/cranelift/codegen/src/isa/x64/lower/isle/generated_code.manifest
@@ -1,4 +1,4 @@
 src/clif.isle 9ea75a6f790b5c03
 src/prelude.isle b2bc986bcbbbb77
-src/isa/x64/inst.isle cdd292107fb36cf
-src/isa/x64/lower.isle c049f7d36db0e0fb
+src/isa/x64/inst.isle 5644ccb29bff0b51
+src/isa/x64/lower.isle 51d6ce13a3e47bc4
--- a/cranelift/codegen/src/isa/x64/lower/isle/generated_code.rs
+++ b/cranelift/codegen/src/isa/x64/lower/isle/generated_code.rs
--- a/cranelift/filetests/filetests/isa/x64/i128.clif
+++ b/cranelift/filetests/filetests/isa/x64/i128.clif
@@ -241,20 +241,20 @@ block0(v0: i128, v1: i128):
 ;   Inst 5:   movq    %r14, 32(%rsp)
 ;   Inst 6:   movq    %rbx, 40(%rsp)
 ;   Inst 7:   movq    %r15, 48(%rsp)
-;   Inst 8:   cmpq    %rcx, %rsi
+;   Inst 8:   cmpq    %rdx, %rdi
 ;   Inst 9:   setz    %al
-;   Inst 10:   cmpq    %rdx, %rdi
+;   Inst 10:   cmpq    %rcx, %rsi
 ;   Inst 11:   setz    %r8b
-;   Inst 12:   andq    %rax, %r8
-;   Inst 13:   andq    $1, %r8
+;   Inst 12:   andq    %r8, %rax
+;   Inst 13:   andq    $1, %rax
 ;   Inst 14:   setnz   %al
 ;   Inst 15:   movq    %rax, rsp(0 + virtual offset)
-;   Inst 16:   cmpq    %rcx, %rsi
+;   Inst 16:   cmpq    %rdx, %rdi
 ;   Inst 17:   setnz   %al
-;   Inst 18:   cmpq    %rdx, %rdi
+;   Inst 18:   cmpq    %rcx, %rsi
 ;   Inst 19:   setnz   %r8b
-;   Inst 20:   orq     %rax, %r8
-;   Inst 21:   andq    $1, %r8
+;   Inst 20:   orq     %r8, %rax
+;   Inst 21:   andq    $1, %rax
 ;   Inst 22:   setnz   %r8b
 ;   Inst 23:   cmpq    %rcx, %rsi
 ;   Inst 24:   setl    %r9b
--- a/cranelift/filetests/filetests/isa/x64/simd-comparison-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-comparison-legalize.clif
@@ -55,14 +55,16 @@ block0(v0: i16x8, v1: i16x8):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 7)
+;   (instruction range: 0 .. 9)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
-;   Inst 2:   pminsw  %xmm1, %xmm0
-;   Inst 3:   pcmpeqw %xmm1, %xmm0
-;   Inst 4:   movq    %rbp, %rsp
-;   Inst 5:   popq    %rbp
-;   Inst 6:   ret
+;   Inst 2:   movdqa  %xmm1, %xmm2
+;   Inst 3:   movdqa  %xmm0, %xmm1
+;   Inst 4:   pmaxsw  %xmm2, %xmm1
+;   Inst 5:   pcmpeqw %xmm1, %xmm0
+;   Inst 6:   movq    %rbp, %rsp
+;   Inst 7:   popq    %rbp
+;   Inst 8:   ret
 ; }}

 function %icmp_uge_i8x16(i8x16, i8x16) -> b8x16 {
@@ -75,13 +77,15 @@ block0(v0: i8x16, v1: i8x16):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 7)
+;   (instruction range: 0 .. 9)
 ;   Inst 0:   pushq   %rbp
 ;   Inst 1:   movq    %rsp, %rbp
-;   Inst 2:   pminub  %xmm1, %xmm0
-;   Inst 3:   pcmpeqb %xmm1, %xmm0
-;   Inst 4:   movq    %rbp, %rsp
-;   Inst 5:   popq    %rbp
-;   Inst 6:   ret
+;   Inst 2:   movdqa  %xmm1, %xmm2
+;   Inst 3:   movdqa  %xmm0, %xmm1
+;   Inst 4:   pmaxub  %xmm2, %xmm1
+;   Inst 5:   pcmpeqb %xmm1, %xmm0
+;   Inst 6:   movq    %rbp, %rsp
+;   Inst 7:   popq    %rbp
+;   Inst 8:   ret
 ; }}