Add vector compare to 0 optims (#3887)

Signed-off-by: Freddie Liardet <frederick.liardet@arm.com>
This commit is contained in:
FreddieLiardet
2022-03-10 00:20:06 +00:00
committed by GitHub
parent 8b48ce7fb7
commit 13b9396931
10 changed files with 1748 additions and 162 deletions

View File

@@ -1158,6 +1158,24 @@
(Cnt)
;; Compare bitwise equal to 0
(Cmeq0)
;; Compare signed greater than or equal to 0
(Cmge0)
;; Compare signed greater than 0
(Cmgt0)
;; Compare signed less than or equal to 0
(Cmle0)
;; Compare signed less than 0
(Cmlt0)
;; Floating point compare equal to 0
(Fcmeq0)
;; Floating point compare greater than or equal to 0
(Fcmge0)
;; Floating point compare greater than 0
(Fcmgt0)
;; Floating point compare less than or equal to 0
(Fcmle0)
;; Floating point compare less than 0
(Fcmlt0)
))
;; A vector widening operation with one argument.
@@ -1997,3 +2015,79 @@
(value_regs
(alu_rrr op ty x_lo y_lo)
(alu_rrr op ty x_hi y_hi))))
;; Float vector compare helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; These helpers support lowering a vector `fcmp` in which one operand is a
;; splat of +0.0 into a single AArch64 compare-against-zero instruction
;; (the `#0.0` forms of fcmeq/fcmge/fcmgt/fcmle/fcmlt).
;; Match 32 bit float 0 value
(decl zero_value_f32 (Ieee32) Ieee32)
(extern extractor zero_value_f32 zero_value_f32)
;; Match 64 bit float 0 value
(decl zero_value_f64 (Ieee64) Ieee64)
(extern extractor zero_value_f64 zero_value_f64)
;; Generate comparison to zero operator from input condition code
(decl float_cc_cmp_zero_to_vec_misc_op (FloatCC) VecMisc2)
(extern constructor float_cc_cmp_zero_to_vec_misc_op float_cc_cmp_zero_to_vec_misc_op)
;; As above, but with the comparison direction reversed, for the case where
;; the zero constant is the first operand (e.g. `0 >= x` becomes `x <= 0`).
(decl float_cc_cmp_zero_to_vec_misc_op_swap (FloatCC) VecMisc2)
(extern constructor float_cc_cmp_zero_to_vec_misc_op_swap float_cc_cmp_zero_to_vec_misc_op_swap)
;; Match valid generic compare to zero cases, i.e. the condition codes that
;; lower to exactly one compare-against-zero instruction
(decl fcmp_zero_cond (FloatCC) FloatCC)
(extern extractor fcmp_zero_cond fcmp_zero_cond)
;; Match not equal compare to zero separately as it requires two output
;; instructions (compare-equal-to-zero followed by a bitwise NOT)
(decl fcmp_zero_cond_not_eq (FloatCC) FloatCC)
(extern extractor fcmp_zero_cond_not_eq fcmp_zero_cond_not_eq)
;; Helper for generating float compare to zero instructions where 2nd argument is zero
(decl float_cmp_zero (FloatCC Reg VectorSize) Reg)
(rule (float_cmp_zero cond rn size)
(vec_misc (float_cc_cmp_zero_to_vec_misc_op cond) rn size))
;; Helper for generating float compare to zero instructions in case where 1st argument is zero
(decl float_cmp_zero_swap (FloatCC Reg VectorSize) Reg)
(rule (float_cmp_zero_swap cond rn size)
(vec_misc (float_cc_cmp_zero_to_vec_misc_op_swap cond) rn size))
;; Helper for generating float compare equal to zero instruction; used with a
;; NOT to implement `fcmp ne` against zero
(decl fcmeq0 (Reg VectorSize) Reg)
(rule (fcmeq0 rn size)
(vec_misc (VecMisc2.Fcmeq0) rn size))
;; Int vector compare helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Same scheme as the float helpers above, for integer `icmp` against a splat
;; of zero (the `#0` forms of cmeq/cmge/cmgt/cmle/cmlt).
;; Match integer 0 value
(decl zero_value (Imm64) Imm64)
(extern extractor zero_value zero_value)
;; Generate comparison to zero operator from input condition code
(decl int_cc_cmp_zero_to_vec_misc_op (IntCC) VecMisc2)
(extern constructor int_cc_cmp_zero_to_vec_misc_op int_cc_cmp_zero_to_vec_misc_op)
;; As above, but with the comparison direction reversed, for the case where
;; the zero constant is the first operand.
(decl int_cc_cmp_zero_to_vec_misc_op_swap (IntCC) VecMisc2)
(extern constructor int_cc_cmp_zero_to_vec_misc_op_swap int_cc_cmp_zero_to_vec_misc_op_swap)
;; Match valid generic compare to zero cases (equality and the signed
;; orderings, which lower to exactly one instruction)
(decl icmp_zero_cond (IntCC) IntCC)
(extern extractor icmp_zero_cond icmp_zero_cond)
;; Match not equal compare to zero separately as it requires two output
;; instructions (compare-equal-to-zero followed by a bitwise NOT)
(decl icmp_zero_cond_not_eq (IntCC) IntCC)
(extern extractor icmp_zero_cond_not_eq icmp_zero_cond_not_eq)
;; Helper for generating int compare to zero instructions where 2nd argument is zero
(decl int_cmp_zero (IntCC Reg VectorSize) Reg)
(rule (int_cmp_zero cond rn size)
(vec_misc (int_cc_cmp_zero_to_vec_misc_op cond) rn size))
;; Helper for generating int compare to zero instructions in case where 1st argument is zero
(decl int_cmp_zero_swap (IntCC Reg VectorSize) Reg)
(rule (int_cmp_zero_swap cond rn size)
(vec_misc (int_cc_cmp_zero_to_vec_misc_op_swap cond) rn size))
;; Helper for generating int compare equal to zero instruction; used with a
;; NOT to implement `icmp ne` against zero
(decl cmeq0 (Reg VectorSize) Reg)
(rule (cmeq0 rn size)
(vec_misc (VecMisc2.Cmeq0) rn size))

View File

@@ -1765,6 +1765,50 @@ impl MachInstEmit for Inst {
(0b0, 0b00101, enc_size)
}
// Integer compare-against-zero forms. Each arm yields the tuple
// (U bit, bits 12..16 of the opcode, size encoding) consumed by
// enc_vec_rr_misc below.
VecMisc2::Cmeq0 => (0b0, 0b01001, enc_size),
VecMisc2::Cmge0 => (0b1, 0b01000, enc_size),
VecMisc2::Cmgt0 => (0b0, 0b01000, enc_size),
VecMisc2::Cmle0 => (0b1, 0b01001, enc_size),
VecMisc2::Cmlt0 => (0b0, 0b01010, enc_size),
// Floating-point compare-against-zero forms. The debug asserts restrict
// these to vector arrangements with 32- or 64-bit lanes, the only lane
// widths the FP zero-compare instructions accept.
VecMisc2::Fcmeq0 => {
debug_assert!(
size == VectorSize::Size32x2
|| size == VectorSize::Size32x4
|| size == VectorSize::Size64x2
);
(0b0, 0b01101, enc_size)
}
VecMisc2::Fcmge0 => {
debug_assert!(
size == VectorSize::Size32x2
|| size == VectorSize::Size32x4
|| size == VectorSize::Size64x2
);
(0b1, 0b01100, enc_size)
}
VecMisc2::Fcmgt0 => {
debug_assert!(
size == VectorSize::Size32x2
|| size == VectorSize::Size32x4
|| size == VectorSize::Size64x2
);
(0b0, 0b01100, enc_size)
}
VecMisc2::Fcmle0 => {
debug_assert!(
size == VectorSize::Size32x2
|| size == VectorSize::Size32x4
|| size == VectorSize::Size64x2
);
(0b1, 0b01101, enc_size)
}
VecMisc2::Fcmlt0 => {
debug_assert!(
size == VectorSize::Size32x2
|| size == VectorSize::Size32x4
|| size == VectorSize::Size64x2
);
(0b0, 0b01110, enc_size)
}
};
sink.put4(enc_vec_rr_misc((q << 1) | u, size, bits_12_16, rd, rn));
}

View File

@@ -4518,15 +4518,114 @@ fn test_aarch64_binemit() {
"cnt v23.8b, v5.8b",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Fcmeq0,
rd: writable_vreg(5),
rn: vreg(2),
size: VectorSize::Size32x4,
},
"45D8A04E",
"fcmeq v5.4s, v2.4s, #0.0",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Fcmge0,
rd: writable_vreg(3),
rn: vreg(1),
size: VectorSize::Size64x2,
},
"23C8E06E",
"fcmge v3.2d, v1.2d, #0.0",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Fcmgt0,
rd: writable_vreg(5),
rn: vreg(7),
size: VectorSize::Size32x4,
},
"E5C8A04E",
"fcmgt v5.4s, v7.4s, #0.0",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Fcmle0,
rd: writable_vreg(10),
rn: vreg(2),
size: VectorSize::Size32x4,
},
"4AD8A06E",
"fcmle v10.4s, v2.4s, #0.0",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Fcmlt0,
rd: writable_vreg(12),
rn: vreg(12),
size: VectorSize::Size64x2,
},
"8CE9E04E",
"fcmlt v12.2d, v12.2d, #0.0",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Cmeq0,
rd: writable_vreg(22),
rn: vreg(27),
size: VectorSize::Size16x8,
},
"769B604E",
"cmeq v22.8h, v27.8h, #0",
));
insns.push((
    Inst::VecMisc {
        op: VecMisc2::Cmge0,
        rd: writable_vreg(12),
        rn: vreg(27),
        size: VectorSize::Size16x8,
    },
    // CMGE (zero), .8h arrangement: U=1, opcode 0b01000.
    // (Removed a stray leftover `cmeq` encoding/disassembly pair that made
    // this tuple carry four strings instead of the expected two.)
    "6C8B606E",
    "cmge v12.8h, v27.8h, #0",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Cmgt0,
rd: writable_vreg(12),
rn: vreg(27),
size: VectorSize::Size8x16,
},
"6C8B204E",
"cmgt v12.16b, v27.16b, #0",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Cmle0,
rd: writable_vreg(1),
rn: vreg(27),
size: VectorSize::Size32x4,
},
"619BA06E",
"cmle v1.4s, v27.4s, #0",
));
insns.push((
Inst::VecMisc {
op: VecMisc2::Cmlt0,
rd: writable_vreg(0),
rn: vreg(7),
size: VectorSize::Size64x2,
},
"E0A8E04E",
"cmlt v0.2d, v7.2d, #0",
));
insns.push((

View File

@@ -3175,6 +3175,15 @@ impl Inst {
VecMisc2::Frintp => ("frintp", size, ""),
VecMisc2::Cnt => ("cnt", size, ""),
// Compare-against-zero forms print a literal zero as the final operand:
// `, #0` for the integer compares, `, #0.0` for the floating-point ones.
VecMisc2::Cmeq0 => ("cmeq", size, ", #0"),
VecMisc2::Cmge0 => ("cmge", size, ", #0"),
VecMisc2::Cmgt0 => ("cmgt", size, ", #0"),
VecMisc2::Cmle0 => ("cmle", size, ", #0"),
VecMisc2::Cmlt0 => ("cmlt", size, ", #0"),
VecMisc2::Fcmeq0 => ("fcmeq", size, ", #0.0"),
VecMisc2::Fcmge0 => ("fcmge", size, ", #0.0"),
VecMisc2::Fcmgt0 => ("fcmgt", size, ", #0.0"),
VecMisc2::Fcmle0 => ("fcmle", size, ", #0.0"),
VecMisc2::Fcmlt0 => ("fcmlt", size, ", #0.0"),
};
let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
let rn = show_vreg_vector(rn, mb_rru, size);

View File

@@ -1124,3 +1124,69 @@
;; Popcount on 16 byte lanes maps directly onto the vector CNT instruction.
(rule (lower (has_type $I8X16 (popcnt x)))
(vec_cnt x (VectorSize.Size8x16)))
;;;; Rules for `fcmp` 32 bit ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Vector `fcmp` where the 2nd operand is a splat of a 32-bit +0.0.
;; `ne` needs two instructions: compare-equal-to-zero, then a bitwise NOT.
(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) x (splat (f32const (zero_value_f32 y))))))
(let ((rn Reg x)
(vec_size VectorSize (vector_size ty)))
(value_reg (not (fcmeq0 rn vec_size) vec_size))))
;; Remaining supported conditions lower to one compare-to-zero instruction.
(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) x (splat (f32const (zero_value_f32 y))))))
(let ((rn Reg x)
(vec_size VectorSize (vector_size ty)))
(value_reg (float_cmp_zero cond rn vec_size))))
;; Zero splat as the 1st operand; `ne` is symmetric, so the same
;; fcmeq0 + NOT sequence applies.
(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) (splat (f32const (zero_value_f32 x))) y)))
(let ((rn Reg y)
(vec_size VectorSize (vector_size ty)))
(value_reg (not (fcmeq0 rn vec_size) vec_size))))
;; Zero splat as the 1st operand for the ordered conditions: use the
;; swapped operator so the comparison direction is reversed.
(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) (splat (f32const (zero_value_f32 x))) y)))
(let ((rn Reg y)
(vec_size VectorSize (vector_size ty)))
(value_reg (float_cmp_zero_swap cond rn vec_size))))
;;;; Rules for `fcmp` 64 bit ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Same four cases as the 32-bit rules above, matching a 64-bit zero constant.
(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) x (splat (f64const (zero_value_f64 y))))))
(let ((rn Reg x)
(vec_size VectorSize (vector_size ty)))
(value_reg (not (fcmeq0 rn vec_size) vec_size))))
(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) x (splat (f64const (zero_value_f64 y))))))
(let ((rn Reg x)
(vec_size VectorSize (vector_size ty)))
(value_reg (float_cmp_zero cond rn vec_size))))
(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) (splat (f64const (zero_value_f64 x))) y)))
(let ((rn Reg y)
(vec_size VectorSize (vector_size ty)))
(value_reg (not (fcmeq0 rn vec_size) vec_size))))
(rule (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) (splat (f64const (zero_value_f64 x))) y)))
(let ((rn Reg y)
(vec_size VectorSize (vector_size ty)))
(value_reg (float_cmp_zero_swap cond rn vec_size))))
;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Integer analogue of the above: `icmp` against a splat of integer zero.
;; `ne` lowers to cmeq-to-zero followed by a bitwise NOT.
(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond_not_eq cond) x (splat (iconst (zero_value y))))))
(let ((rn Reg x)
(vec_size VectorSize (vector_size ty)))
(value_reg (not (cmeq0 rn vec_size) vec_size))))
;; Equality and signed orderings lower to one compare-to-zero instruction.
(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond cond) x (splat (iconst (zero_value y))))))
(let ((rn Reg x)
(vec_size VectorSize (vector_size ty)))
(value_reg (int_cmp_zero cond rn vec_size))))
;; Zero splat as the 1st operand; `ne` is symmetric.
(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond_not_eq cond) (splat (iconst (zero_value x))) y)))
(let ((rn Reg y)
(vec_size VectorSize (vector_size ty)))
(value_reg (not (cmeq0 rn vec_size) vec_size))))
;; Zero splat as the 1st operand: reverse the comparison direction.
(rule (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond cond) (splat (iconst (zero_value x))) y)))
(let ((rn Reg y)
(vec_size VectorSize (vector_size ty)))
(value_reg (int_cmp_zero_swap cond rn vec_size))))

View File

@@ -6,9 +6,9 @@ pub mod generated_code;
// Types that the generated ISLE code uses via `use super::*`.
use super::{
writable_zero_reg, zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, AtomicRmwOp, BranchTarget,
CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, Imm12, ImmLogic, ImmShift,
Inst as MInst, JTSequenceInfo, MachLabel, MoveWideConst, NarrowValueMode, Opcode, OperandSize,
PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VectorSize, NZCV,
CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift,
Inst as MInst, IntCC, JTSequenceInfo, MachLabel, MoveWideConst, NarrowValueMode, Opcode,
OperandSize, PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV,
};
use crate::isa::aarch64::settings::Flags as IsaFlags;
use crate::machinst::isle::*;
@@ -286,4 +286,105 @@ where
let amount = val.value() & u8::try_from(ty.bits() - 1).unwrap();
ImmShift::maybe_from_u64(u64::from(ty.bits()) - u64::from(amount)).unwrap()
}
/// Filter for `icmp` condition codes that lower to a single vector
/// compare-against-zero instruction: equality plus the four signed
/// orderings. Returns `None` for everything else (e.g. unsigned compares).
fn icmp_zero_cond(&mut self, cond: &IntCC) -> Option<IntCC> {
    let single_instruction = matches!(
        cond,
        IntCC::Equal
            | IntCC::SignedGreaterThanOrEqual
            | IntCC::SignedGreaterThan
            | IntCC::SignedLessThanOrEqual
            | IntCC::SignedLessThan
    );
    if single_instruction {
        Some(*cond)
    } else {
        None
    }
}
/// Filter for `fcmp` condition codes that lower to a single vector
/// compare-against-zero instruction: equality plus the four ordered
/// comparisons. Returns `None` for everything else.
fn fcmp_zero_cond(&mut self, cond: &FloatCC) -> Option<FloatCC> {
    let single_instruction = matches!(
        cond,
        FloatCC::Equal
            | FloatCC::GreaterThanOrEqual
            | FloatCC::GreaterThan
            | FloatCC::LessThanOrEqual
            | FloatCC::LessThan
    );
    if single_instruction {
        Some(*cond)
    } else {
        None
    }
}
/// Matches only `FloatCC::NotEqual`. It is handled apart from
/// `fcmp_zero_cond` because the lowering needs two instructions
/// (compare-equal-to-zero, then a bitwise NOT).
fn fcmp_zero_cond_not_eq(&mut self, cond: &FloatCC) -> Option<FloatCC> {
    if let FloatCC::NotEqual = cond {
        Some(FloatCC::NotEqual)
    } else {
        None
    }
}
/// Matches only `IntCC::NotEqual`. It is handled apart from
/// `icmp_zero_cond` because the lowering needs two instructions
/// (compare-equal-to-zero, then a bitwise NOT).
fn icmp_zero_cond_not_eq(&mut self, cond: &IntCC) -> Option<IntCC> {
    if let IntCC::NotEqual = cond {
        Some(IntCC::NotEqual)
    } else {
        None
    }
}
/// Maps a float condition code to the vector compare-against-zero opcode
/// for the case where the zero is the second operand. Callers must have
/// filtered the condition through `fcmp_zero_cond` first; any other
/// condition code is a lowering bug, so panic with a diagnostic instead
/// of a bare `panic!()`.
fn float_cc_cmp_zero_to_vec_misc_op(&mut self, cond: &FloatCC) -> VecMisc2 {
    match cond {
        &FloatCC::Equal => VecMisc2::Fcmeq0,
        &FloatCC::GreaterThanOrEqual => VecMisc2::Fcmge0,
        &FloatCC::LessThanOrEqual => VecMisc2::Fcmle0,
        &FloatCC::GreaterThan => VecMisc2::Fcmgt0,
        &FloatCC::LessThan => VecMisc2::Fcmlt0,
        _ => panic!("float_cc_cmp_zero_to_vec_misc_op: unhandled FloatCC"),
    }
}
/// Maps an integer condition code to the vector compare-against-zero opcode
/// for the case where the zero is the second operand. Callers must have
/// filtered the condition through `icmp_zero_cond` first; any other
/// condition code is a lowering bug, so panic with a diagnostic instead
/// of a bare `panic!()`.
fn int_cc_cmp_zero_to_vec_misc_op(&mut self, cond: &IntCC) -> VecMisc2 {
    match cond {
        &IntCC::Equal => VecMisc2::Cmeq0,
        &IntCC::SignedGreaterThanOrEqual => VecMisc2::Cmge0,
        &IntCC::SignedLessThanOrEqual => VecMisc2::Cmle0,
        &IntCC::SignedGreaterThan => VecMisc2::Cmgt0,
        &IntCC::SignedLessThan => VecMisc2::Cmlt0,
        _ => panic!("int_cc_cmp_zero_to_vec_misc_op: unhandled IntCC"),
    }
}
/// As `float_cc_cmp_zero_to_vec_misc_op`, but for the case where the zero
/// is the FIRST operand, so the comparison direction is reversed
/// (`0 >= x` becomes `x <= 0`, etc.). Equality is symmetric and maps to
/// itself. Panics with a diagnostic on an unfiltered condition code.
fn float_cc_cmp_zero_to_vec_misc_op_swap(&mut self, cond: &FloatCC) -> VecMisc2 {
    match cond {
        &FloatCC::Equal => VecMisc2::Fcmeq0,
        &FloatCC::GreaterThanOrEqual => VecMisc2::Fcmle0,
        &FloatCC::LessThanOrEqual => VecMisc2::Fcmge0,
        &FloatCC::GreaterThan => VecMisc2::Fcmlt0,
        &FloatCC::LessThan => VecMisc2::Fcmgt0,
        _ => panic!("float_cc_cmp_zero_to_vec_misc_op_swap: unhandled FloatCC"),
    }
}
/// As `int_cc_cmp_zero_to_vec_misc_op`, but for the case where the zero
/// is the FIRST operand, so the comparison direction is reversed
/// (`0 >= x` becomes `x <= 0`, etc.). Equality is symmetric and maps to
/// itself. Panics with a diagnostic on an unfiltered condition code.
fn int_cc_cmp_zero_to_vec_misc_op_swap(&mut self, cond: &IntCC) -> VecMisc2 {
    match cond {
        &IntCC::Equal => VecMisc2::Cmeq0,
        &IntCC::SignedGreaterThanOrEqual => VecMisc2::Cmle0,
        &IntCC::SignedLessThanOrEqual => VecMisc2::Cmge0,
        &IntCC::SignedGreaterThan => VecMisc2::Cmlt0,
        &IntCC::SignedLessThan => VecMisc2::Cmgt0,
        _ => panic!("int_cc_cmp_zero_to_vec_misc_op_swap: unhandled IntCC"),
    }
}
/// Extractor: succeeds (yielding the immediate back) only when every bit
/// of the integer immediate is zero.
fn zero_value(&mut self, value: Imm64) -> Option<Imm64> {
    match value.bits() {
        0 => Some(value),
        _ => None,
    }
}
/// Extractor: succeeds only when the f32 constant's bit pattern is all
/// zeros, i.e. exactly +0.0 (-0.0 has the sign bit set and is rejected).
fn zero_value_f32(&mut self, value: Ieee32) -> Option<Ieee32> {
    match value.bits() {
        0 => Some(value),
        _ => None,
    }
}
/// Extractor: succeeds only when the f64 constant's bit pattern is all
/// zeros, i.e. exactly +0.0 (-0.0 has the sign bit set and is rejected).
fn zero_value_f64(&mut self, value: Ieee64) -> Option<Ieee64> {
    match value.bits() {
        0 => Some(value),
        _ => None,
    }
}
}

View File

@@ -1,4 +1,4 @@
src/clif.isle 9ea75a6f790b5c03
src/prelude.isle b2bc986bcbbbb77
src/isa/aarch64/inst.isle 3678d0a37bdb4cff
src/isa/aarch64/lower.isle 90accbfcadaea46d
src/isa/aarch64/inst.isle 19ccefb6a496d392
src/isa/aarch64/lower.isle 90ead921762336d2

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,415 @@
test compile precise-output
set unwind_info=false
target aarch64
function %f0(i8x16) -> b8x16 {
block0(v0: i8x16):
v1 = iconst.i8 0
v2 = splat.i8x16 v1
v3 = icmp eq v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: cmeq v0.16b, v0.16b, #0
; Inst 1: ret
; }}
function %f1(i16x8) -> b16x8 {
block0(v0: i16x8):
v1 = iconst.i16 0
v2 = splat.i16x8 v1
v3 = icmp eq v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: cmeq v0.8h, v0.8h, #0
; Inst 1: ret
; }}
function %f2(i32x4) -> b32x4 {
block0(v0: i32x4):
v1 = iconst.i32 0
v2 = splat.i32x4 v1
v3 = icmp ne v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 3)
; Inst 0: cmeq v0.4s, v0.4s, #0
; Inst 1: mvn v0.16b, v0.16b
; Inst 2: ret
; }}
function %f3(i64x2) -> b64x2 {
block0(v0: i64x2):
v1 = iconst.i64 0
v2 = splat.i64x2 v1
v3 = icmp ne v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 3)
; Inst 0: cmeq v0.2d, v0.2d, #0
; Inst 1: mvn v0.16b, v0.16b
; Inst 2: ret
; }}
function %f4(i8x16) -> b8x16 {
block0(v0: i8x16):
v1 = iconst.i8 0
v2 = splat.i8x16 v1
v3 = icmp sle v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: cmle v0.16b, v0.16b, #0
; Inst 1: ret
; }}
function %f5(i16x8) -> b16x8 {
block0(v0: i16x8):
v1 = iconst.i16 0
v2 = splat.i16x8 v1
v3 = icmp sle v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: cmge v0.8h, v0.8h, #0
; Inst 1: ret
; }}
function %f6(i32x4) -> b32x4 {
block0(v0: i32x4):
v1 = iconst.i32 0
v2 = splat.i32x4 v1
v3 = icmp sge v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: cmge v0.4s, v0.4s, #0
; Inst 1: ret
; }}
function %f7(i64x2) -> b64x2 {
block0(v0: i64x2):
v1 = iconst.i64 0
v2 = splat.i64x2 v1
v3 = icmp sge v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: cmle v0.2d, v0.2d, #0
; Inst 1: ret
; }}
function %f8(i8x16) -> b8x16 {
block0(v0: i8x16):
v1 = iconst.i8 0
v2 = splat.i8x16 v1
v3 = icmp slt v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: cmlt v0.16b, v0.16b, #0
; Inst 1: ret
; }}
function %f9(i16x8) -> b16x8 {
block0(v0: i16x8):
v1 = iconst.i16 0
v2 = splat.i16x8 v1
v3 = icmp slt v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: cmgt v0.8h, v0.8h, #0
; Inst 1: ret
; }}
function %f10(i32x4) -> b32x4 {
block0(v0: i32x4):
v1 = iconst.i32 0
v2 = splat.i32x4 v1
v3 = icmp sgt v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: cmgt v0.4s, v0.4s, #0
; Inst 1: ret
; }}
function %f11(i64x2) -> b64x2 {
block0(v0: i64x2):
v1 = iconst.i64 0
v2 = splat.i64x2 v1
v3 = icmp sgt v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: cmlt v0.2d, v0.2d, #0
; Inst 1: ret
; }}
function %f12(f32x4) -> b32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v2 = splat.f32x4 v1
v3 = fcmp eq v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fcmeq v0.4s, v0.4s, #0.0
; Inst 1: ret
; }}
function %f13(f64x2) -> b64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v2 = splat.f64x2 v1
v3 = fcmp eq v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fcmeq v0.2d, v0.2d, #0.0
; Inst 1: ret
; }}
function %f14(f64x2) -> b64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v2 = splat.f64x2 v1
v3 = fcmp ne v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 3)
; Inst 0: fcmeq v0.2d, v0.2d, #0.0
; Inst 1: mvn v0.16b, v0.16b
; Inst 2: ret
; }}
function %f15(f32x4) -> b32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v2 = splat.f32x4 v1
v3 = fcmp ne v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 3)
; Inst 0: fcmeq v0.4s, v0.4s, #0.0
; Inst 1: mvn v0.16b, v0.16b
; Inst 2: ret
; }}
function %f16(f32x4) -> b32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v2 = splat.f32x4 v1
v3 = fcmp le v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fcmle v0.4s, v0.4s, #0.0
; Inst 1: ret
; }}
function %f17(f64x2) -> b64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v2 = splat.f64x2 v1
v3 = fcmp le v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fcmge v0.2d, v0.2d, #0.0
; Inst 1: ret
; }}
function %f18(f64x2) -> b64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v2 = splat.f64x2 v1
v3 = fcmp ge v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fcmge v0.2d, v0.2d, #0.0
; Inst 1: ret
; }}
function %f19(f32x4) -> b32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v2 = splat.f32x4 v1
v3 = fcmp ge v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fcmle v0.4s, v0.4s, #0.0
; Inst 1: ret
; }}
function %f20(f32x4) -> b32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v2 = splat.f32x4 v1
v3 = fcmp lt v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fcmlt v0.4s, v0.4s, #0.0
; Inst 1: ret
; }}
function %f21(f64x2) -> b64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v2 = splat.f64x2 v1
v3 = fcmp lt v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fcmgt v0.2d, v0.2d, #0.0
; Inst 1: ret
; }}
function %f22(f64x2) -> b64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v2 = splat.f64x2 v1
v3 = fcmp gt v0, v2
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fcmgt v0.2d, v0.2d, #0.0
; Inst 1: ret
; }}
function %f23(f32x4) -> b32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v2 = splat.f32x4 v1
v3 = fcmp gt v2, v0
return v3
}
; VCode_ShowWithRRU {{
; Entry block: 0
; Block 0:
; (original IR block: block0)
; (instruction range: 0 .. 2)
; Inst 0: fcmlt v0.4s, v0.4s, #0.0
; Inst 1: ret
; }}

View File

@@ -0,0 +1,255 @@
test run
target aarch64
; raw_bitcast is needed to get around issue with "bint" on aarch64
function %simd_icmp_eq_i8(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i8 0
v3 = splat.i8x16 v1
v2 = icmp eq v0, v3
v4 = raw_bitcast.i8x16 v2
return v4
}
; run: %simd_icmp_eq_i8([-1 0 1 100 -1 0 1 100 -1 0 1 100 -1 0 1 100]) == [0 0xff 0 0 0 0xff 0 0 0 0xff 0 0 0 0xff 0 0]
function %simd_icmp_ne_i16(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i16 0
v3 = splat.i16x8 v1
v2 = icmp ne v0, v3
v4 = raw_bitcast.i16x8 v2
return v4
}
; run: %simd_icmp_ne_i16([-1 0 1 100 -1 0 1 100]) == [0xffff 0 0xffff 0xffff 0xffff 0 0xffff 0xffff]
function %simd_icmp_le_i32(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 0
v3 = splat.i32x4 v1
v2 = icmp sle v0, v3
v4 = raw_bitcast.i32x4 v2
return v4
}
; run: %simd_icmp_le_i32([-1 0 1 100]) == [0xffffffff 0xffffffff 0 0]
function %simd_icmp_ge_i64(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i64 0
v3 = splat.i64x2 v1
v2 = icmp sge v0, v3
v4 = raw_bitcast.i64x2 v2
return v4
}
; run: %simd_icmp_ge_i64([-1 0]) == [0 0xffffffffffffffff]
; run: %simd_icmp_ge_i64([1 100]) == [0xffffffffffffffff 0xffffffffffffffff]
function %simd_icmp_lt_i8(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i8 0
v3 = splat.i8x16 v1
v2 = icmp slt v0, v3
v4 = raw_bitcast.i8x16 v2
return v4
}
; run: %simd_icmp_lt_i8([-1 0 1 100 -1 0 1 100 -1 0 1 100 -1 0 1 100]) == [0xff 0 0 0 0xff 0 0 0 0xff 0 0 0 0xff 0 0 0]
function %simd_icmp_gt_i16(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i16 0
v3 = splat.i16x8 v1
v2 = icmp sgt v0, v3
v4 = raw_bitcast.i16x8 v2
return v4
}
; run: %simd_icmp_gt_i16([-1 0 1 100 -1 0 1 100]) == [0 0 0xffff 0xffff 0 0 0xffff 0xffff]
function %simd_fcmp_eq_f32(f32x4) -> i32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v3 = splat.f32x4 v1
v2 = fcmp eq v0, v3
v4 = raw_bitcast.i32x4 v2
return v4
}
; run: %simd_fcmp_eq_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0 0xffffffff 0 0]
function %simd_fcmp_ne_f64(f64x2) -> i64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v3 = splat.f64x2 v1
v2 = fcmp ne v0, v3
v4 = raw_bitcast.i64x2 v2
return v4
}
; run: %simd_fcmp_ne_f64([-0x1.0 0x0.0]) == [0xffffffffffffffff 0]
; run: %simd_fcmp_ne_f64([0x1.0 NaN]) == [0xffffffffffffffff 0xffffffffffffffff]
function %simd_fcmp_le_f32(f32x4) -> i32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v3 = splat.f32x4 v1
v2 = fcmp le v0, v3
v4 = raw_bitcast.i32x4 v2
return v4
}
; run: %simd_fcmp_le_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0xffffffff 0 0]
function %simd_fcmp_ge_f64(f64x2) -> i64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v3 = splat.f64x2 v1
v2 = fcmp ge v0, v3
v4 = raw_bitcast.i64x2 v2
return v4
}
; run: %simd_fcmp_ge_f64([-0x1.0 0x0.0]) == [0 0xffffffffffffffff]
; run: %simd_fcmp_ge_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
function %simd_fcmp_lt_f32(f32x4) -> i32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v3 = splat.f32x4 v1
v2 = fcmp lt v0, v3
v4 = raw_bitcast.i32x4 v2
return v4
}
; run: %simd_fcmp_lt_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0 0 0]
function %simd_fcmp_gt_f64(f64x2) -> i64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v3 = splat.f64x2 v1
v2 = fcmp gt v0, v3
v4 = raw_bitcast.i64x2 v2
return v4
}
; run: %simd_fcmp_gt_f64([-0x1.0 0x0.0]) == [0 0]
; run: %simd_fcmp_gt_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
function %simd_icmp_eq_i32(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 0
v3 = splat.i32x4 v1
v2 = icmp eq v3, v0
v4 = raw_bitcast.i32x4 v2
return v4
}
; run: %simd_icmp_eq_i32([1 0 -1 100]) == [0 0xffffffff 0 0]
function %simd_icmp_ne_i64(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i64 0
v3 = splat.i64x2 v1
v2 = icmp ne v3, v0
v4 = raw_bitcast.i64x2 v2
return v4
}
; run: %simd_icmp_ne_i64([-1 0]) == [0xffffffffffffffff 0]
; run: %simd_icmp_ne_i64([1 100]) == [0xffffffffffffffff 0xffffffffffffffff]
function %simd_icmp_le_i8(i8x16) -> i8x16 {
block0(v0: i8x16):
v1 = iconst.i8 0
v3 = splat.i8x16 v1
v2 = icmp sle v3, v0
v4 = raw_bitcast.i8x16 v2
return v4
}
; run: %simd_icmp_le_i8([-1 0 1 100 -1 0 1 100 -1 0 1 100 -1 0 1 100]) == [0 0xff 0xff 0xff 0 0xff 0xff 0xff 0 0xff 0xff 0xff 0 0xff 0xff 0xff]
function %simd_icmp_ge_i16(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = iconst.i16 0
v3 = splat.i16x8 v1
v2 = icmp sge v3, v0
v4 = raw_bitcast.i16x8 v2
return v4
}
; run: %simd_icmp_ge_i16([-1 0 1 100 -1 0 1 100]) == [0xffff 0xffff 0 0 0xffff 0xffff 0 0]
function %simd_icmp_lt_i32(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iconst.i32 0
v3 = splat.i32x4 v1
v2 = icmp slt v3, v0
v4 = raw_bitcast.i32x4 v2
return v4
}
; run: %simd_icmp_lt_i32([-1 0 1 100]) == [0 0 0xffffffff 0xffffffff]
function %simd_icmp_gt_i64(i64x2) -> i64x2 {
block0(v0: i64x2):
v1 = iconst.i64 0
v3 = splat.i64x2 v1
v2 = icmp sgt v3, v0
v4 = raw_bitcast.i64x2 v2
return v4
}
; run: %simd_icmp_gt_i64([-1 0]) == [0xffffffffffffffff 0]
; run: %simd_icmp_gt_i64([1 100]) == [0 0]
function %simd_fcmp_eq_f64(f64x2) -> i64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v3 = splat.f64x2 v1
v2 = fcmp eq v3, v0
v4 = raw_bitcast.i64x2 v2
return v4
}
; run: %simd_fcmp_eq_f64([-0x1.0 0x0.0]) == [0 0xffffffffffffffff]
; run: %simd_fcmp_eq_f64([0x1.0 NaN]) == [0 0]
function %simd_fcmp_ne_f32(f32x4) -> i32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v3 = splat.f32x4 v1
v2 = fcmp ne v3, v0
v4 = raw_bitcast.i32x4 v2
return v4
}
; run: %simd_fcmp_ne_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0 0xffffffff 0xffffffff]
function %simd_fcmp_le_f64(f64x2) -> i64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v3 = splat.f64x2 v1
v2 = fcmp le v3, v0
v4 = raw_bitcast.i64x2 v2
return v4
}
; run: %simd_fcmp_le_f64([-0x1.0 0x0.0]) == [0 0xffffffffffffffff]
; run: %simd_fcmp_le_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
function %simd_fcmp_ge_f32(f32x4) -> i32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v3 = splat.f32x4 v1
v2 = fcmp ge v3, v0
v4 = raw_bitcast.i32x4 v2
return v4
}
; run: %simd_fcmp_ge_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0xffffffff 0 0]
function %simd_fcmp_lt_f64(f64x2) -> i64x2 {
block0(v0: f64x2):
v1 = f64const 0.0
v3 = splat.f64x2 v1
v2 = fcmp lt v3, v0
v4 = raw_bitcast.i64x2 v2
return v4
}
; run: %simd_fcmp_lt_f64([-0x1.0 0x0.0]) == [0 0]
; run: %simd_fcmp_lt_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
function %simd_fcmp_gt_f32(f32x4) -> i32x4 {
block0(v0: f32x4):
v1 = f32const 0.0
v3 = splat.f32x4 v1
v2 = fcmp gt v3, v0
v4 = raw_bitcast.i32x4 v2
return v4
}
; run: %simd_fcmp_gt_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0 0 0]