Add vector compare to 0 optims (#3887)

Signed-off-by: Freddie Liardet <frederick.liardet@arm.com>
2022-03-10 00:20:06 +00:00
parent 8b48ce7fb7
commit 13b9396931
10 changed files with 1748 additions and 162 deletions
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -1158,6 +1158,24 @@
    (Cnt)
    ;; Compare bitwise equal to 0
    (Cmeq0)
    ;; Compare signed greater than or equal to 0
    (Cmge0)
    ;; Compare signed greater than 0
    (Cmgt0)
    ;; Compare signed less than or equal to 0
    (Cmle0)
    ;; Compare signed less than 0
    (Cmlt0)
    ;; Floating point compare equal to 0
    (Fcmeq0)
    ;; Floating point compare greater than or equal to 0
    (Fcmge0)
    ;; Floating point compare greater than 0
    (Fcmgt0)
    ;; Floating point compare less than or equal to 0
    (Fcmle0)
    ;; Floating point compare less than 0
    (Fcmlt0)
 ))
 ;; A vector widening operation with one argument.
@@ -1997,3 +2015,79 @@
        (value_regs
          (alu_rrr op ty x_lo y_lo)
          (alu_rrr op ty x_hi y_hi))))
 ;; Float vector compare helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Match 32 bit float 0 value
 ;; Extern extractor: the Rust implementation (`zero_value_f32` in lower/isle.rs)
 ;; yields the immediate only when `bits() == 0`, i.e. the all-zero bit pattern.
 ;; NOTE(review): negative zero has its sign bit set, so it presumably does not
 ;; match here -- confirm that is intended.
 (decl zero_value_f32 (Ieee32) Ieee32)
 (extern extractor zero_value_f32 zero_value_f32)
 ;; Match 64 bit float 0 value
 ;; Same `bits() == 0` check as `zero_value_f32`, applied to an `Ieee64` immediate.
 (decl zero_value_f64 (Ieee64) Ieee64)
 (extern extractor zero_value_f64 zero_value_f64)
 ;; Generate comparison to zero operator from input condition code
 ;; Extern constructor used when zero is the second operand (`x <cond> 0.0`): the
 ;; Rust implementation maps Equal/GreaterThanOrEqual/LessThanOrEqual/GreaterThan/
 ;; LessThan to Fcmeq0/Fcmge0/Fcmle0/Fcmgt0/Fcmlt0 and panics on any other code.
 (decl float_cc_cmp_zero_to_vec_misc_op (FloatCC) VecMisc2)
 (extern constructor float_cc_cmp_zero_to_vec_misc_op float_cc_cmp_zero_to_vec_misc_op)
 ;; Variant used when zero is the first operand (`0.0 <cond> x`): the Rust
 ;; implementation mirrors the ordering (e.g. GreaterThan -> Fcmlt0) while Equal
 ;; still maps to Fcmeq0. It also panics on any other code.
 (decl float_cc_cmp_zero_to_vec_misc_op_swap (FloatCC) VecMisc2)
 (extern constructor float_cc_cmp_zero_to_vec_misc_op_swap float_cc_cmp_zero_to_vec_misc_op_swap)
 ;; Match valid generic compare to zero cases
 ;; Extern extractor: the Rust implementation accepts Equal, GreaterThanOrEqual,
 ;; GreaterThan, LessThanOrEqual and LessThan and returns the code unchanged; every
 ;; other condition code fails to match.
 (decl fcmp_zero_cond (FloatCC) FloatCC)
 (extern extractor fcmp_zero_cond fcmp_zero_cond)
 ;; Match not equal compare to zero separately as it requires two output instructions
 ;; (the lowering rules emit `fcmeq0` followed by `not`). Only NotEqual matches.
 (decl fcmp_zero_cond_not_eq (FloatCC) FloatCC)
 (extern extractor fcmp_zero_cond_not_eq fcmp_zero_cond_not_eq)
 ;; Helper for generating float compare to zero instructions where 2nd argument is zero
 ;; `cond` selects the opcode via `float_cc_cmp_zero_to_vec_misc_op`; `rn` is the
 ;; non-zero operand and `size` the vector arrangement. Produces the register
 ;; returned by `vec_misc`.
 (decl float_cmp_zero (FloatCC Reg VectorSize) Reg)
 (rule (float_cmp_zero cond rn size)
      (vec_misc (float_cc_cmp_zero_to_vec_misc_op cond) rn size))
 ;; Helper for generating float compare to zero instructions in case where 1st argument is zero
 ;; Identical to `float_cmp_zero` except that the opcode comes from
 ;; `float_cc_cmp_zero_to_vec_misc_op_swap`, which mirrors the ordering so that
 ;; `0.0 <cond> rn` can be emitted as a compare of `rn` against zero.
 (decl float_cmp_zero_swap (FloatCC Reg VectorSize) Reg)
 (rule (float_cmp_zero_swap cond rn size)
      (vec_misc (float_cc_cmp_zero_to_vec_misc_op_swap cond) rn size))
 ;; Helper for generating float compare equal to zero instruction
 ;; Emits `vec_misc` with the fixed opcode `VecMisc2.Fcmeq0`; the not-equal
 ;; lowering rules wrap its result in `not`.
 (decl fcmeq0 (Reg VectorSize) Reg)
 (rule (fcmeq0 rn size)
      (vec_misc (VecMisc2.Fcmeq0) rn size))
 ;; Int vector compare helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; Match integer 0 value
 ;; Extern extractor: the Rust implementation (`zero_value` in lower/isle.rs)
 ;; yields the `Imm64` only when `bits() == 0`.
 (decl zero_value (Imm64) Imm64)
 (extern extractor zero_value zero_value)
 ;; Generate comparison to zero operator from input condition code
 ;; Extern constructor used when zero is the second operand (`x <cond> 0`): the
 ;; Rust implementation maps Equal and the four signed orderings to
 ;; Cmeq0/Cmge0/Cmle0/Cmgt0/Cmlt0 and panics on any other code.
 (decl int_cc_cmp_zero_to_vec_misc_op (IntCC) VecMisc2)
 (extern constructor int_cc_cmp_zero_to_vec_misc_op int_cc_cmp_zero_to_vec_misc_op)
 ;; Variant used when zero is the first operand (`0 <cond> x`): the Rust
 ;; implementation mirrors the ordering (e.g. SignedGreaterThan -> Cmlt0) while
 ;; Equal still maps to Cmeq0. It also panics on any other code.
 (decl int_cc_cmp_zero_to_vec_misc_op_swap (IntCC) VecMisc2)
 (extern constructor int_cc_cmp_zero_to_vec_misc_op_swap int_cc_cmp_zero_to_vec_misc_op_swap)
 ;; Match valid generic compare to zero cases
 ;; Extern extractor: the Rust implementation accepts Equal,
 ;; SignedGreaterThanOrEqual, SignedGreaterThan, SignedLessThanOrEqual and
 ;; SignedLessThan and returns the code unchanged; every other condition code
 ;; fails to match.
 (decl icmp_zero_cond (IntCC) IntCC)
 (extern extractor icmp_zero_cond icmp_zero_cond)
 ;; Match not equal compare to zero separately as it requires two output instructions
 ;; (the lowering rules emit `cmeq0` followed by `not`). Only NotEqual matches.
 (decl icmp_zero_cond_not_eq (IntCC) IntCC)
 (extern extractor icmp_zero_cond_not_eq icmp_zero_cond_not_eq)
 ;; Helper for generating int compare to zero instructions where 2nd argument is zero
 ;; `cond` selects the opcode via `int_cc_cmp_zero_to_vec_misc_op`; `rn` is the
 ;; non-zero operand and `size` the vector arrangement. Produces the register
 ;; returned by `vec_misc`.
 (decl int_cmp_zero (IntCC Reg VectorSize) Reg)
 (rule (int_cmp_zero cond rn size)
      (vec_misc (int_cc_cmp_zero_to_vec_misc_op cond) rn size))
 ;; Helper for generating int compare to zero instructions in case where 1st argument is zero
 ;; Identical to `int_cmp_zero` except that the opcode comes from
 ;; `int_cc_cmp_zero_to_vec_misc_op_swap`, which mirrors the ordering so that
 ;; `0 <cond> rn` can be emitted as a compare of `rn` against zero.
 (decl int_cmp_zero_swap (IntCC Reg VectorSize) Reg)
 (rule (int_cmp_zero_swap cond rn size)
      (vec_misc (int_cc_cmp_zero_to_vec_misc_op_swap cond) rn size))
 ;; Helper for generating int compare equal to zero instruction
 ;; Emits `vec_misc` with the fixed opcode `VecMisc2.Cmeq0`; the not-equal
 ;; lowering rules wrap its result in `not`.
 (decl cmeq0 (Reg VectorSize) Reg)
 (rule (cmeq0 rn size)
      (vec_misc (VecMisc2.Cmeq0) rn size))
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1765,6 +1765,50 @@ impl MachInstEmit for Inst {
                        (0b0, 0b00101, enc_size)
                    }
                    VecMisc2::Cmeq0 => (0b0, 0b01001, enc_size),
                    VecMisc2::Cmge0 => (0b1, 0b01000, enc_size),
                    VecMisc2::Cmgt0 => (0b0, 0b01000, enc_size),
                    VecMisc2::Cmle0 => (0b1, 0b01001, enc_size),
                    VecMisc2::Cmlt0 => (0b0, 0b01010, enc_size),
                    VecMisc2::Fcmeq0 => {
                        debug_assert!(
                            size == VectorSize::Size32x2
                                || size == VectorSize::Size32x4
                                || size == VectorSize::Size64x2
                        );
                        (0b0, 0b01101, enc_size)
                    }
                    VecMisc2::Fcmge0 => {
                        debug_assert!(
                            size == VectorSize::Size32x2
                                || size == VectorSize::Size32x4
                                || size == VectorSize::Size64x2
                        );
                        (0b1, 0b01100, enc_size)
                    }
                    VecMisc2::Fcmgt0 => {
                        debug_assert!(
                            size == VectorSize::Size32x2
                                || size == VectorSize::Size32x4
                                || size == VectorSize::Size64x2
                        );
                        (0b0, 0b01100, enc_size)
                    }
                    VecMisc2::Fcmle0 => {
                        debug_assert!(
                            size == VectorSize::Size32x2
                                || size == VectorSize::Size32x4
                                || size == VectorSize::Size64x2
                        );
                        (0b1, 0b01101, enc_size)
                    }
                    VecMisc2::Fcmlt0 => {
                        debug_assert!(
                            size == VectorSize::Size32x2
                                || size == VectorSize::Size32x4
                                || size == VectorSize::Size64x2
                        );
                        (0b0, 0b01110, enc_size)
                    }
                };
                sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
            }
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -4518,15 +4518,114 @@ fn test_aarch64_binemit() {
        "cnt v23.8b, v5.8b",
    ));
    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Fcmeq0,
            rd: writable_vreg(5),
            rn: vreg(2),
            size: VectorSize::Size32x4,
        },
        "45D8A04E",
        "fcmeq v5.4s, v2.4s, #0.0",
    ));
    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Fcmge0,
            rd: writable_vreg(3),
            rn: vreg(1),
            size: VectorSize::Size64x2,
        },
        "23C8E06E",
        "fcmge v3.2d, v1.2d, #0.0",
    ));
    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Fcmgt0,
            rd: writable_vreg(5),
            rn: vreg(7),
            size: VectorSize::Size32x4,
        },
        "E5C8A04E",
        "fcmgt v5.4s, v7.4s, #0.0",
    ));
    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Fcmle0,
            rd: writable_vreg(10),
            rn: vreg(2),
            size: VectorSize::Size32x4,
        },
        "4AD8A06E",
        "fcmle v10.4s, v2.4s, #0.0",
    ));
    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Fcmlt0,
            rd: writable_vreg(12),
            rn: vreg(12),
            size: VectorSize::Size64x2,
        },
        "8CE9E04E",
        "fcmlt v12.2d, v12.2d, #0.0",
    ));
    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Cmeq0,
            rd: writable_vreg(22),
            rn: vreg(27),
            size: VectorSize::Size16x8,
        },
        "769B604E",
        "cmeq v22.8h, v27.8h, #0",
    ));
    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Cmge0,
            rd: writable_vreg(12),
            rn: vreg(27),
            size: VectorSize::Size16x8,
        },
-        "6C9B604E",
+        "6C8B606E",
-        "cmeq v12.8h, v27.8h, #0",
+        "cmge v12.8h, v27.8h, #0",
    ));
    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Cmgt0,
            rd: writable_vreg(12),
            rn: vreg(27),
            size: VectorSize::Size8x16,
        },
        "6C8B204E",
        "cmgt v12.16b, v27.16b, #0",
    ));
    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Cmle0,
            rd: writable_vreg(1),
            rn: vreg(27),
            size: VectorSize::Size32x4,
        },
        "619BA06E",
        "cmle v1.4s, v27.4s, #0",
    ));
    insns.push((
        Inst::VecMisc {
            op: VecMisc2::Cmlt0,
            rd: writable_vreg(0),
            rn: vreg(7),
            size: VectorSize::Size64x2,
        },
        "E0A8E04E",
        "cmlt v0.2d, v7.2d, #0",
    ));
    insns.push((
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -3175,6 +3175,15 @@ impl Inst {
                    VecMisc2::Frintp => ("frintp", size, ""),
                    VecMisc2::Cnt => ("cnt", size, ""),
                    VecMisc2::Cmeq0 => ("cmeq", size, ", #0"),
                    VecMisc2::Cmge0 => ("cmge", size, ", #0"),
                    VecMisc2::Cmgt0 => ("cmgt", size, ", #0"),
                    VecMisc2::Cmle0 => ("cmle", size, ", #0"),
                    VecMisc2::Cmlt0 => ("cmlt", size, ", #0"),
                    VecMisc2::Fcmeq0 => ("fcmeq", size, ", #0.0"),
                    VecMisc2::Fcmge0 => ("fcmge", size, ", #0.0"),
                    VecMisc2::Fcmgt0 => ("fcmgt", size, ", #0.0"),
                    VecMisc2::Fcmle0 => ("fcmle", size, ", #0.0"),
                    VecMisc2::Fcmlt0 => ("fcmlt", size, ", #0.0"),
                };
                let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
                let rn = show_vreg_vector(rn, mb_rru, size);
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -1124,3 +1124,69 @@
 (rule (lower (has_type $I8X16 (popcnt x)))
      (vec_cnt x (VectorSize.Size8x16)))
 ;;;; Rules for `fcmp` 32 bit ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; `fcmp ne x, splat(f32 zero)`: there is no single not-equal-to-zero opcode among
 ;; the helpers, so compare equal to zero with `fcmeq0` and invert the result with `not`.
 (rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) x (splat (f32const (zero_value_f32 y))))))
      (let ((rn Reg x)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (not (fcmeq0 rn vec_size) vec_size))))
 ;; `fcmp <cond> x, splat(f32 zero)` for the conditions accepted by
 ;; `fcmp_zero_cond`: a single compare-against-zero chosen by `float_cmp_zero`.
 (rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) x (splat (f32const (zero_value_f32 y))))))
      (let ((rn Reg x)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (float_cmp_zero cond rn vec_size))))
 ;; `fcmp ne splat(f32 zero), y`: zero is the first operand. The emitted sequence
 ;; is the same as for the zero-second form: `fcmeq0` on `y`, then `not`.
 (rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) (splat (f32const (zero_value_f32 x))) y)))
      (let ((rn Reg y)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (not (fcmeq0 rn vec_size) vec_size))))
 ;; `fcmp <cond> splat(f32 zero), y`: zero is the first operand, so
 ;; `float_cmp_zero_swap` picks the mirrored compare-against-zero opcode for `y`.
 (rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) (splat (f32const (zero_value_f32 x))) y)))
      (let ((rn Reg y)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (float_cmp_zero_swap cond rn vec_size))))
 ;;;; Rules for `fcmp` 64 bit ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; `fcmp ne x, splat(f64 zero)`: compare equal to zero with `fcmeq0`, then invert
 ;; the result with `not` (two instructions).
 (rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) x (splat (f64const (zero_value_f64 y))))))
      (let ((rn Reg x)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (not (fcmeq0 rn vec_size) vec_size))))
 ;; `fcmp <cond> x, splat(f64 zero)` for the conditions accepted by
 ;; `fcmp_zero_cond`: a single compare-against-zero chosen by `float_cmp_zero`.
 (rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) x (splat (f64const (zero_value_f64 y))))))
      (let ((rn Reg x)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (float_cmp_zero cond rn vec_size))))
 ;; `fcmp ne splat(f64 zero), y`: zero is the first operand. The emitted sequence
 ;; is the same as for the zero-second form: `fcmeq0` on `y`, then `not`.
 (rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) (splat (f64const (zero_value_f64 x))) y)))
      (let ((rn Reg y)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (not (fcmeq0 rn vec_size) vec_size))))
 ;; `fcmp <cond> splat(f64 zero), y`: zero is the first operand, so
 ;; `float_cmp_zero_swap` picks the mirrored compare-against-zero opcode for `y`.
 (rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) (splat (f64const (zero_value_f64 x))) y)))
      (let ((rn Reg y)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (float_cmp_zero_swap cond rn vec_size))))
 ;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; `icmp ne x, splat(0)`: compare equal to zero with `cmeq0`, then invert the
 ;; result with `not` (two instructions).
 (rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond_not_eq cond) x (splat (iconst (zero_value y))))))
      (let ((rn Reg x)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (not (cmeq0 rn vec_size) vec_size))))
 ;; `icmp <cond> x, splat(0)` for the conditions accepted by `icmp_zero_cond`
 ;; (equal and the signed orderings): a single compare-against-zero chosen by
 ;; `int_cmp_zero`.
 (rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond cond) x (splat (iconst (zero_value y))))))
      (let ((rn Reg x)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (int_cmp_zero cond rn vec_size))))
 ;; `icmp ne splat(0), y`: zero is the first operand. The emitted sequence is the
 ;; same as for the zero-second form: `cmeq0` on `y`, then `not`.
 (rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond_not_eq cond) (splat (iconst (zero_value x))) y)))
      (let ((rn Reg y)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (not (cmeq0 rn vec_size) vec_size))))
 ;; `icmp <cond> splat(0), y`: zero is the first operand, so `int_cmp_zero_swap`
 ;; picks the mirrored compare-against-zero opcode for `y`.
 (rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond cond) (splat (iconst (zero_value x))) y)))
      (let ((rn Reg y)
            (vec_size VectorSize (vector_size ty)))
          (value_reg (int_cmp_zero_swap cond rn vec_size))))
--- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
@@ -6,9 +6,9 @@ pub mod generated_code;
 // Types that the generated ISLE code uses via `use super::*`.
 use super::{
    writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, AtomicRmwOp, BranchTarget,
-    CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, Imm12, ImmLogic, ImmShift,
+    CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift,
-    Inst as MInst, JTSequenceInfo, MachLabel, MoveWideConst, NarrowValueMode, Opcode, OperandSize,
+    Inst as MInst, IntCC, JTSequenceInfo, MachLabel, MoveWideConst, NarrowValueMode, Opcode,
-    PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VectorSize, NZCV,
+    OperandSize, PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV,
 };
 use crate::isa::aarch64::settings::Flags as IsaFlags;
 use crate::machinst::isle::*;
@@ -286,4 +286,105 @@ where
        let amount = val.value() & u8::try_from(ty.bits() - 1).unwrap();
        ImmShift::maybe_from_u64(u64::from(ty.bits()) - u64::from(amount)).unwrap()
    }
    /// Extractor backing the ISLE term `icmp_zero_cond`.
    ///
    /// Returns the condition code unchanged when it is `Equal` or one of the four
    /// signed orderings; every other code (e.g. `NotEqual`) yields `None` so the
    /// rule does not match.
    fn icmp_zero_cond(&mut self, cond: &IntCC) -> Option<IntCC> {
        let cc = *cond;
        let has_zero_form = matches!(
            cc,
            IntCC::Equal
                | IntCC::SignedGreaterThanOrEqual
                | IntCC::SignedGreaterThan
                | IntCC::SignedLessThanOrEqual
                | IntCC::SignedLessThan
        );
        if has_zero_form {
            Some(cc)
        } else {
            None
        }
    }
    /// Extractor backing the ISLE term `fcmp_zero_cond`.
    ///
    /// Returns the condition code unchanged when it is `Equal`,
    /// `GreaterThanOrEqual`, `GreaterThan`, `LessThanOrEqual` or `LessThan`;
    /// every other code (e.g. `NotEqual`) yields `None`.
    fn fcmp_zero_cond(&mut self, cond: &FloatCC) -> Option<FloatCC> {
        let cc = *cond;
        let has_zero_form = matches!(
            cc,
            FloatCC::Equal
                | FloatCC::GreaterThanOrEqual
                | FloatCC::GreaterThan
                | FloatCC::LessThanOrEqual
                | FloatCC::LessThan
        );
        if has_zero_form {
            Some(cc)
        } else {
            None
        }
    }
    /// Extractor backing the ISLE term `fcmp_zero_cond_not_eq`: matches only
    /// `FloatCC::NotEqual`, which the lowering rules handle separately from the
    /// conditions accepted by `fcmp_zero_cond`.
    fn fcmp_zero_cond_not_eq(&mut self, cond: &FloatCC) -> Option<FloatCC> {
        if let FloatCC::NotEqual = *cond {
            Some(FloatCC::NotEqual)
        } else {
            None
        }
    }
    /// Extractor backing the ISLE term `icmp_zero_cond_not_eq`: matches only
    /// `IntCC::NotEqual`, which the lowering rules handle separately from the
    /// conditions accepted by `icmp_zero_cond`.
    fn icmp_zero_cond_not_eq(&mut self, cond: &IntCC) -> Option<IntCC> {
        if let IntCC::NotEqual = *cond {
            Some(IntCC::NotEqual)
        } else {
            None
        }
    }
    /// Maps a float condition code to the vector compare-against-zero opcode for
    /// the case where zero is the second operand (`x <cond> 0.0`).
    ///
    /// # Panics
    ///
    /// Panics on any code other than `Equal`, `GreaterThanOrEqual`,
    /// `LessThanOrEqual`, `GreaterThan` or `LessThan`. That is the same set
    /// `fcmp_zero_cond` accepts, so the lowering rules that filter through it
    /// never reach the panic; the message names the offending code in case a
    /// new caller does.
    fn float_cc_cmp_zero_to_vec_misc_op(&mut self, cond: &FloatCC) -> VecMisc2 {
        match *cond {
            FloatCC::Equal => VecMisc2::Fcmeq0,
            FloatCC::GreaterThanOrEqual => VecMisc2::Fcmge0,
            FloatCC::LessThanOrEqual => VecMisc2::Fcmle0,
            FloatCC::GreaterThan => VecMisc2::Fcmgt0,
            FloatCC::LessThan => VecMisc2::Fcmlt0,
            other => panic!(
                "no vector compare-with-zero opcode for float condition code {:?}",
                other
            ),
        }
    }
    /// Maps an integer condition code to the vector compare-against-zero opcode
    /// for the case where zero is the second operand (`x <cond> 0`).
    ///
    /// # Panics
    ///
    /// Panics on any code other than `Equal` and the four signed orderings.
    /// That is the same set `icmp_zero_cond` accepts, so the lowering rules that
    /// filter through it never reach the panic; the message names the offending
    /// code in case a new caller does.
    fn int_cc_cmp_zero_to_vec_misc_op(&mut self, cond: &IntCC) -> VecMisc2 {
        match *cond {
            IntCC::Equal => VecMisc2::Cmeq0,
            IntCC::SignedGreaterThanOrEqual => VecMisc2::Cmge0,
            IntCC::SignedLessThanOrEqual => VecMisc2::Cmle0,
            IntCC::SignedGreaterThan => VecMisc2::Cmgt0,
            IntCC::SignedLessThan => VecMisc2::Cmlt0,
            other => panic!(
                "no vector compare-with-zero opcode for integer condition code {:?}",
                other
            ),
        }
    }
    /// Maps a float condition code to the vector compare-against-zero opcode for
    /// the case where zero is the FIRST operand (`0.0 <cond> x`).
    ///
    /// The instruction compares `x` against zero, so the ordering is mirrored:
    /// `0.0 >= x` becomes `x <= 0.0` (`Fcmle0`), `0.0 < x` becomes `x > 0.0`
    /// (`Fcmgt0`), and so on. `Equal` is symmetric and stays `Fcmeq0`.
    ///
    /// # Panics
    ///
    /// Panics on any code other than `Equal`, `GreaterThanOrEqual`,
    /// `LessThanOrEqual`, `GreaterThan` or `LessThan`; the message names the
    /// offending code.
    fn float_cc_cmp_zero_to_vec_misc_op_swap(&mut self, cond: &FloatCC) -> VecMisc2 {
        match *cond {
            FloatCC::Equal => VecMisc2::Fcmeq0,
            FloatCC::GreaterThanOrEqual => VecMisc2::Fcmle0,
            FloatCC::LessThanOrEqual => VecMisc2::Fcmge0,
            FloatCC::GreaterThan => VecMisc2::Fcmlt0,
            FloatCC::LessThan => VecMisc2::Fcmgt0,
            other => panic!(
                "no swapped vector compare-with-zero opcode for float condition code {:?}",
                other
            ),
        }
    }
    /// Maps an integer condition code to the vector compare-against-zero opcode
    /// for the case where zero is the FIRST operand (`0 <cond> x`).
    ///
    /// The instruction compares `x` against zero, so the ordering is mirrored:
    /// `0 >= x` becomes `x <= 0` (`Cmle0`), `0 < x` becomes `x > 0` (`Cmgt0`),
    /// and so on. `Equal` is symmetric and stays `Cmeq0`.
    ///
    /// # Panics
    ///
    /// Panics on any code other than `Equal` and the four signed orderings; the
    /// message names the offending code.
    fn int_cc_cmp_zero_to_vec_misc_op_swap(&mut self, cond: &IntCC) -> VecMisc2 {
        match *cond {
            IntCC::Equal => VecMisc2::Cmeq0,
            IntCC::SignedGreaterThanOrEqual => VecMisc2::Cmle0,
            IntCC::SignedLessThanOrEqual => VecMisc2::Cmge0,
            IntCC::SignedGreaterThan => VecMisc2::Cmlt0,
            IntCC::SignedLessThan => VecMisc2::Cmgt0,
            other => panic!(
                "no swapped vector compare-with-zero opcode for integer condition code {:?}",
                other
            ),
        }
    }
    /// Extractor backing the ISLE term `zero_value`: yields the immediate back
    /// when its `bits()` are exactly zero, otherwise `None`.
    fn zero_value(&mut self, value: Imm64) -> Option<Imm64> {
        match value.bits() {
            0 => Some(value),
            _ => None,
        }
    }
    /// Extractor backing the ISLE term `zero_value_f32`: yields the immediate
    /// back when its `bits()` are exactly zero, otherwise `None`.
    ///
    /// NOTE(review): negative zero has its sign bit set, so it presumably does
    /// not match here -- confirm that is intended.
    fn zero_value_f32(&mut self, value: Ieee32) -> Option<Ieee32> {
        match value.bits() {
            0 => Some(value),
            _ => None,
        }
    }
    /// Extractor backing the ISLE term `zero_value_f64`: yields the immediate
    /// back when its `bits()` are exactly zero, otherwise `None`.
    ///
    /// NOTE(review): negative zero has its sign bit set, so it presumably does
    /// not match here -- confirm that is intended.
    fn zero_value_f64(&mut self, value: Ieee64) -> Option<Ieee64> {
        match value.bits() {
            0 => Some(value),
            _ => None,
        }
    }
 }
--- a/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.manifest
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.manifest
@@ -1,4 +1,4 @@
 src/clif.isle 9ea75a6f790b5c03
 src/prelude.isle b2bc986bcbbbb77
-src/isa/aarch64/inst.isle 3678d0a37bdb4cff
+src/isa/aarch64/inst.isle 19ccefb6a496d392
-src/isa/aarch64/lower.isle 90accbfcadaea46d
+src/isa/aarch64/lower.isle 90ead921762336d2
--- a/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs
--- a/cranelift/filetests/filetests/isa/aarch64/compare_zero.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/compare_zero.clif
@@ -0,0 +1,415 @@
 test compile precise-output
 set unwind_info=false
 target aarch64
 function %f0(i8x16) -> b8x16 {
 block0(v0: i8x16):
  v1 = iconst.i8 0
  v2 = splat.i8x16 v1
  v3 = icmp eq v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   cmeq v0.16b, v0.16b, #0
 ;   Inst 1:   ret
 ; }}
 function %f1(i16x8) -> b16x8 {
 block0(v0: i16x8):
  v1 = iconst.i16 0
  v2 = splat.i16x8 v1
  v3 = icmp eq v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   cmeq v0.8h, v0.8h, #0
 ;   Inst 1:   ret
 ; }}
 function %f2(i32x4) -> b32x4 {
 block0(v0: i32x4):
  v1 = iconst.i32 0
  v2 = splat.i32x4 v1
  v3 = icmp ne v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 3)
 ;   Inst 0:   cmeq v0.4s, v0.4s, #0
 ;   Inst 1:   mvn v0.16b, v0.16b
 ;   Inst 2:   ret
 ; }}
 function %f3(i64x2) -> b64x2 {
 block0(v0: i64x2):
  v1 = iconst.i64 0
  v2 = splat.i64x2 v1
  v3 = icmp ne v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 3)
 ;   Inst 0:   cmeq v0.2d, v0.2d, #0
 ;   Inst 1:   mvn v0.16b, v0.16b
 ;   Inst 2:   ret
 ; }}
 function %f4(i8x16) -> b8x16 {
 block0(v0: i8x16):
  v1 = iconst.i8 0
  v2 = splat.i8x16 v1
  v3 = icmp sle v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   cmle v0.16b, v0.16b, #0
 ;   Inst 1:   ret
 ; }}
 function %f5(i16x8) -> b16x8 {
 block0(v0: i16x8):
  v1 = iconst.i16 0
  v2 = splat.i16x8 v1
  v3 = icmp sle v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   cmge v0.8h, v0.8h, #0
 ;   Inst 1:   ret
 ; }}
 function %f6(i32x4) -> b32x4 {
 block0(v0: i32x4):
  v1 = iconst.i32 0
  v2 = splat.i32x4 v1
  v3 = icmp sge v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   cmge v0.4s, v0.4s, #0
 ;   Inst 1:   ret
 ; }}
 function %f7(i64x2) -> b64x2 {
 block0(v0: i64x2):
  v1 = iconst.i64 0
  v2 = splat.i64x2 v1
  v3 = icmp sge v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   cmle v0.2d, v0.2d, #0
 ;   Inst 1:   ret
 ; }}
 function %f8(i8x16) -> b8x16 {
 block0(v0: i8x16):
  v1 = iconst.i8 0
  v2 = splat.i8x16 v1
  v3 = icmp slt v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   cmlt v0.16b, v0.16b, #0
 ;   Inst 1:   ret
 ; }}
 function %f9(i16x8) -> b16x8 {
 block0(v0: i16x8):
  v1 = iconst.i16 0
  v2 = splat.i16x8 v1
  v3 = icmp slt v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   cmgt v0.8h, v0.8h, #0
 ;   Inst 1:   ret
 ; }}
 function %f10(i32x4) -> b32x4 {
 block0(v0: i32x4):
  v1 = iconst.i32 0
  v2 = splat.i32x4 v1
  v3 = icmp sgt v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   cmgt v0.4s, v0.4s, #0
 ;   Inst 1:   ret
 ; }}
 function %f11(i64x2) -> b64x2 {
 block0(v0: i64x2):
  v1 = iconst.i64 0
  v2 = splat.i64x2 v1
  v3 = icmp sgt v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   cmlt v0.2d, v0.2d, #0
 ;   Inst 1:   ret
 ; }}
 function %f12(f32x4) -> b32x4 {
 block0(v0: f32x4):
  v1 = f32const 0.0
  v2 = splat.f32x4 v1
  v3 = fcmp eq v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fcmeq v0.4s, v0.4s, #0.0
 ;   Inst 1:   ret
 ; }}
 function %f13(f64x2) -> b64x2 {
 block0(v0: f64x2):
  v1 = f64const 0.0
  v2 = splat.f64x2 v1
  v3 = fcmp eq v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fcmeq v0.2d, v0.2d, #0.0
 ;   Inst 1:   ret
 ; }}
 function %f14(f64x2) -> b64x2 {
 block0(v0: f64x2):
  v1 = f64const 0.0
  v2 = splat.f64x2 v1
  v3 = fcmp ne v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 3)
 ;   Inst 0:   fcmeq v0.2d, v0.2d, #0.0
 ;   Inst 1:   mvn v0.16b, v0.16b
 ;   Inst 2:   ret
 ; }}
 function %f15(f32x4) -> b32x4 {
 block0(v0: f32x4):
  v1 = f32const 0.0
  v2 = splat.f32x4 v1
  v3 = fcmp ne v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 3)
 ;   Inst 0:   fcmeq v0.4s, v0.4s, #0.0
 ;   Inst 1:   mvn v0.16b, v0.16b
 ;   Inst 2:   ret
 ; }}
 function %f16(f32x4) -> b32x4 {
 block0(v0: f32x4):
  v1 = f32const 0.0
  v2 = splat.f32x4 v1
  v3 = fcmp le v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fcmle v0.4s, v0.4s, #0.0
 ;   Inst 1:   ret
 ; }}
 function %f17(f64x2) -> b64x2 {
 block0(v0: f64x2):
  v1 = f64const 0.0
  v2 = splat.f64x2 v1
  v3 = fcmp le v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fcmge v0.2d, v0.2d, #0.0
 ;   Inst 1:   ret
 ; }}
 function %f18(f64x2) -> b64x2 {
 block0(v0: f64x2):
  v1 = f64const 0.0
  v2 = splat.f64x2 v1
  v3 = fcmp ge v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fcmge v0.2d, v0.2d, #0.0
 ;   Inst 1:   ret
 ; }}
 function %f19(f32x4) -> b32x4 {
 block0(v0: f32x4):
  v1 = f32const 0.0
  v2 = splat.f32x4 v1
  v3 = fcmp ge v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fcmle v0.4s, v0.4s, #0.0
 ;   Inst 1:   ret
 ; }}
 function %f20(f32x4) -> b32x4 {
 block0(v0: f32x4):
  v1 = f32const 0.0
  v2 = splat.f32x4 v1
  v3 = fcmp lt v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fcmlt v0.4s, v0.4s, #0.0
 ;   Inst 1:   ret
 ; }}
 function %f21(f64x2) -> b64x2 {
 block0(v0: f64x2):
  v1 = f64const 0.0
  v2 = splat.f64x2 v1
  v3 = fcmp lt v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fcmgt v0.2d, v0.2d, #0.0
 ;   Inst 1:   ret
 ; }}
 function %f22(f64x2) -> b64x2 {
 block0(v0: f64x2):
  v1 = f64const 0.0
  v2 = splat.f64x2 v1
  v3 = fcmp gt v0, v2
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fcmgt v0.2d, v0.2d, #0.0
 ;   Inst 1:   ret
 ; }}
 function %f23(f32x4) -> b32x4 {
 block0(v0: f32x4):
  v1 = f32const 0.0
  v2 = splat.f32x4 v1
  v3 = fcmp gt v2, v0
  return v3
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fcmlt v0.4s, v0.4s, #0.0
 ;   Inst 1:   ret
 ; }}
--- a/cranelift/filetests/filetests/runtests/simd_compare_zero.clif
+++ b/cranelift/filetests/filetests/runtests/simd_compare_zero.clif
@@ -0,0 +1,255 @@
 test run
 target aarch64
 ; Each function converts its boolean vector result to an integer vector with
 ; raw_bitcast; this is needed to get around an issue with "bint" on aarch64.
 function %simd_icmp_eq_i8(i8x16) -> i8x16 {
 block0(v0: i8x16):
    v1 = iconst.i8 0
    v3 = splat.i8x16 v1
    v2 = icmp eq v0, v3
    v4 = raw_bitcast.i8x16 v2
    return v4
 }
 ; run: %simd_icmp_eq_i8([-1 0 1 100 -1 0 1 100 -1 0 1 100 -1 0 1 100]) == [0 0xff 0 0 0 0xff 0 0 0 0xff 0 0 0 0xff 0 0]
 function %simd_icmp_ne_i16(i16x8) -> i16x8 {
 block0(v0: i16x8):
    v1 = iconst.i16 0
    v3 = splat.i16x8 v1
    v2 = icmp ne v0, v3
    v4 = raw_bitcast.i16x8 v2
    return v4
 }
 ; run: %simd_icmp_ne_i16([-1 0 1 100 -1 0 1 100]) == [0xffff 0 0xffff 0xffff 0xffff 0 0xffff 0xffff]
 function %simd_icmp_le_i32(i32x4) -> i32x4 {
 block0(v0: i32x4):
    v1 = iconst.i32 0
    v3 = splat.i32x4 v1
    v2 = icmp sle v0, v3
    v4 = raw_bitcast.i32x4 v2
    return v4
 }
 ; run: %simd_icmp_le_i32([-1 0 1 100]) == [0xffffffff 0xffffffff 0 0]
 function %simd_icmp_ge_i64(i64x2) -> i64x2 {
 block0(v0: i64x2):
    v1 = iconst.i64 0
    v3 = splat.i64x2 v1
    v2 = icmp sge v0, v3
    v4 = raw_bitcast.i64x2 v2
    return v4
 }
 ; run: %simd_icmp_ge_i64([-1 0]) == [0 0xffffffffffffffff]
 ; run: %simd_icmp_ge_i64([1 100]) == [0xffffffffffffffff 0xffffffffffffffff]
 function %simd_icmp_lt_i8(i8x16) -> i8x16 {
 block0(v0: i8x16):
    v1 = iconst.i8 0
    v3 = splat.i8x16 v1
    v2 = icmp slt v0, v3
    v4 = raw_bitcast.i8x16 v2
    return v4
 }
 ; run: %simd_icmp_lt_i8([-1 0 1 100 -1 0 1 100 -1 0 1 100 -1 0 1 100]) == [0xff 0 0 0 0xff 0 0 0 0xff 0 0 0 0xff 0 0 0]
 function %simd_icmp_gt_i16(i16x8) -> i16x8 {
 block0(v0: i16x8):
    v1 = iconst.i16 0
    v3 = splat.i16x8 v1
    v2 = icmp sgt v0, v3
    v4 = raw_bitcast.i16x8 v2
    return v4
 }
 ; run: %simd_icmp_gt_i16([-1 0 1 100 -1 0 1 100]) == [0 0 0xffff 0xffff 0 0 0xffff 0xffff]
 function %simd_fcmp_eq_f32(f32x4) -> i32x4 {
 block0(v0: f32x4):
    v1 = f32const 0.0
    v3 = splat.f32x4 v1
    v2 = fcmp eq v0, v3
    v4 = raw_bitcast.i32x4 v2
    return v4
 }
 ; run: %simd_fcmp_eq_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0 0xffffffff 0 0]
 function %simd_fcmp_ne_f64(f64x2) -> i64x2 {
 block0(v0: f64x2):
    v1 = f64const 0.0
    v3 = splat.f64x2 v1
    v2 = fcmp ne v0, v3
    v4 = raw_bitcast.i64x2 v2
    return v4
 }
 ; run: %simd_fcmp_ne_f64([-0x1.0 0x0.0]) == [0xffffffffffffffff 0]
 ; run: %simd_fcmp_ne_f64([0x1.0 NaN]) == [0xffffffffffffffff 0xffffffffffffffff]
 function %simd_fcmp_le_f32(f32x4) -> i32x4 {
 block0(v0: f32x4):
    v1 = f32const 0.0
    v3 = splat.f32x4 v1
    v2 = fcmp le v0, v3
    v4 = raw_bitcast.i32x4 v2
    return v4
 }
 ; run: %simd_fcmp_le_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0xffffffff 0 0]
 function %simd_fcmp_ge_f64(f64x2) -> i64x2 {
 block0(v0: f64x2):
    v1 = f64const 0.0
    v3 = splat.f64x2 v1
    v2 = fcmp ge v0, v3
    v4 = raw_bitcast.i64x2 v2
    return v4
 }
 ; run: %simd_fcmp_ge_f64([-0x1.0 0x0.0]) == [0 0xffffffffffffffff]
 ; run: %simd_fcmp_ge_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
 function %simd_fcmp_lt_f32(f32x4) -> i32x4 {
 block0(v0: f32x4):
    v1 = f32const 0.0
    v3 = splat.f32x4 v1
    v2 = fcmp lt v0, v3
    v4 = raw_bitcast.i32x4 v2
    return v4
 }
 ; run: %simd_fcmp_lt_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0 0 0]
 function %simd_fcmp_gt_f64(f64x2) -> i64x2 {
 block0(v0: f64x2):
    v1 = f64const 0.0
    v3 = splat.f64x2 v1
    v2 = fcmp gt v0, v3
    v4 = raw_bitcast.i64x2 v2
    return v4
 }
 ; run: %simd_fcmp_gt_f64([-0x1.0 0x0.0]) == [0 0]
 ; run: %simd_fcmp_gt_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
 function %simd_icmp_eq_i32(i32x4) -> i32x4 {
 block0(v0: i32x4):
    v1 = iconst.i32 0
    v3 = splat.i32x4 v1
    v2 = icmp eq v3, v0
    v4 = raw_bitcast.i32x4 v2
    return v4
 }
 ; run: %simd_icmp_eq_i32([1 0 -1 100]) == [0 0xffffffff 0 0]
 function %simd_icmp_ne_i64(i64x2) -> i64x2 {
 block0(v0: i64x2):
    v1 = iconst.i64 0
    v3 = splat.i64x2 v1
    v2 = icmp ne v3, v0
    v4 = raw_bitcast.i64x2 v2
    return v4
 }
 ; run: %simd_icmp_ne_i64([-1 0]) == [0xffffffffffffffff 0]
 ; run: %simd_icmp_ne_i64([1 100]) == [0xffffffffffffffff 0xffffffffffffffff]
 function %simd_icmp_le_i8(i8x16) -> i8x16 {
 block0(v0: i8x16):
    v1 = iconst.i8 0
    v3 = splat.i8x16 v1
    v2 = icmp sle v3, v0
    v4 = raw_bitcast.i8x16 v2
    return v4
 }
 ; run: %simd_icmp_le_i8([-1 0 1 100 -1 0 1 100 -1 0 1 100 -1 0 1 100]) == [0 0xff 0xff 0xff 0 0xff 0xff 0xff 0 0xff 0xff 0xff 0 0xff 0xff 0xff]
 function %simd_icmp_ge_i16(i16x8) -> i16x8 {
 block0(v0: i16x8):
    v1 = iconst.i16 0
    v3 = splat.i16x8 v1
    v2 = icmp sge v3, v0
    v4 = raw_bitcast.i16x8 v2
    return v4
 }
 ; run: %simd_icmp_ge_i16([-1 0 1 100 -1 0 1 100]) == [0xffff 0xffff 0 0 0xffff 0xffff 0 0]
 function %simd_icmp_lt_i32(i32x4) -> i32x4 {
 block0(v0: i32x4):
    v1 = iconst.i32 0
    v3 = splat.i32x4 v1
    v2 = icmp slt v3, v0
    v4 = raw_bitcast.i32x4 v2
    return v4
 }
 ; run: %simd_icmp_lt_i32([-1 0 1 100]) == [0 0 0xffffffff 0xffffffff]
 function %simd_icmp_gt_i64(i64x2) -> i64x2 {
 block0(v0: i64x2):
    v1 = iconst.i64 0
    v3 = splat.i64x2 v1
    v2 = icmp sgt v3, v0
    v4 = raw_bitcast.i64x2 v2
    return v4
 }
 ; run: %simd_icmp_gt_i64([-1 0]) == [0xffffffffffffffff 0]
 ; run: %simd_icmp_gt_i64([1 100]) == [0 0]
 function %simd_fcmp_eq_f64(f64x2) -> i64x2 {
 block0(v0: f64x2):
    v1 = f64const 0.0
    v3 = splat.f64x2 v1
    v2 = fcmp eq v3, v0
    v4 = raw_bitcast.i64x2 v2
    return v4
 }
 ; run: %simd_fcmp_eq_f64([-0x1.0 0x0.0]) == [0 0xffffffffffffffff]
 ; run: %simd_fcmp_eq_f64([0x1.0 NaN]) == [0 0]
 function %simd_fcmp_ne_f32(f32x4) -> i32x4 {
 block0(v0: f32x4):
    v1 = f32const 0.0
    v3 = splat.f32x4 v1
    v2 = fcmp ne v3, v0
    v4 = raw_bitcast.i32x4 v2
    return v4
 }
 ; run: %simd_fcmp_ne_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0 0xffffffff 0xffffffff]
 function %simd_fcmp_le_f64(f64x2) -> i64x2 {
 block0(v0: f64x2):
    v1 = f64const 0.0
    v3 = splat.f64x2 v1
    v2 = fcmp le v3, v0
    v4 = raw_bitcast.i64x2 v2
    return v4
 }
 ; run: %simd_fcmp_le_f64([-0x1.0 0x0.0]) == [0 0xffffffffffffffff]
 ; run: %simd_fcmp_le_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
 function %simd_fcmp_ge_f32(f32x4) -> i32x4 {
 block0(v0: f32x4):
    v1 = f32const 0.0
    v3 = splat.f32x4 v1
    v2 = fcmp ge v3, v0
    v4 = raw_bitcast.i32x4 v2
    return v4
 }
 ; run: %simd_fcmp_ge_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0xffffffff 0 0]
 function %simd_fcmp_lt_f64(f64x2) -> i64x2 {
 block0(v0: f64x2):
    v1 = f64const 0.0
    v3 = splat.f64x2 v1
    v2 = fcmp lt v3, v0
    v4 = raw_bitcast.i64x2 v2
    return v4
 }
 ; run: %simd_fcmp_lt_f64([-0x1.0 0x0.0]) == [0 0]
 ; run: %simd_fcmp_lt_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
 function %simd_fcmp_gt_f32(f32x4) -> i32x4 {
 block0(v0: f32x4):
    v1 = f32const 0.0
    v3 = splat.f32x4 v1
    v2 = fcmp gt v3, v0
    v4 = raw_bitcast.i32x4 v2
    return v4
 }
 ; run: %simd_fcmp_gt_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0 0 0]