Add vector compare to 0 optims (#3887)

Signed-off-by: Freddie Liardet <frederick.liardet@arm.com>
2022-03-10 00:20:06 +00:00
parent 8b48ce7fb7
commit 13b9396931
10 changed files with 1748 additions and 162 deletions
--- a/cranelift/filetests/filetests/isa/aarch64/compare_zero.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/compare_zero.clif
@@ -0,0 +1,415 @@
+test compile precise-output
+set unwind_info=false
+target aarch64
+
+function %f0(i8x16) -> b8x16 {
+block0(v0: i8x16):
+  v1 = iconst.i8 0
+  v2 = splat.i8x16 v1
+  v3 = icmp eq v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   cmeq v0.16b, v0.16b, #0
+;   Inst 1:   ret
+; }}
+
+function %f1(i16x8) -> b16x8 {
+block0(v0: i16x8):
+  v1 = iconst.i16 0
+  v2 = splat.i16x8 v1
+  v3 = icmp eq v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   cmeq v0.8h, v0.8h, #0
+;   Inst 1:   ret
+; }}
+
+function %f2(i32x4) -> b32x4 {
+block0(v0: i32x4):
+  v1 = iconst.i32 0
+  v2 = splat.i32x4 v1
+  v3 = icmp ne v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 3)
+;   Inst 0:   cmeq v0.4s, v0.4s, #0
+;   Inst 1:   mvn v0.16b, v0.16b
+;   Inst 2:   ret
+; }}
+
+function %f3(i64x2) -> b64x2 {
+block0(v0: i64x2):
+  v1 = iconst.i64 0
+  v2 = splat.i64x2 v1
+  v3 = icmp ne v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 3)
+;   Inst 0:   cmeq v0.2d, v0.2d, #0
+;   Inst 1:   mvn v0.16b, v0.16b
+;   Inst 2:   ret
+; }}
+
+function %f4(i8x16) -> b8x16 {
+block0(v0: i8x16):
+  v1 = iconst.i8 0
+  v2 = splat.i8x16 v1
+  v3 = icmp sle v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   cmle v0.16b, v0.16b, #0
+;   Inst 1:   ret
+; }}
+
+function %f5(i16x8) -> b16x8 {
+block0(v0: i16x8):
+  v1 = iconst.i16 0
+  v2 = splat.i16x8 v1
+  v3 = icmp sle v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   cmge v0.8h, v0.8h, #0
+;   Inst 1:   ret
+; }}
+
+function %f6(i32x4) -> b32x4 {
+block0(v0: i32x4):
+  v1 = iconst.i32 0
+  v2 = splat.i32x4 v1
+  v3 = icmp sge v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   cmge v0.4s, v0.4s, #0
+;   Inst 1:   ret
+; }}
+
+function %f7(i64x2) -> b64x2 {
+block0(v0: i64x2):
+  v1 = iconst.i64 0
+  v2 = splat.i64x2 v1
+  v3 = icmp sge v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   cmle v0.2d, v0.2d, #0
+;   Inst 1:   ret
+; }}
+
+function %f8(i8x16) -> b8x16 {
+block0(v0: i8x16):
+  v1 = iconst.i8 0
+  v2 = splat.i8x16 v1
+  v3 = icmp slt v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   cmlt v0.16b, v0.16b, #0
+;   Inst 1:   ret
+; }}
+
+function %f9(i16x8) -> b16x8 {
+block0(v0: i16x8):
+  v1 = iconst.i16 0
+  v2 = splat.i16x8 v1
+  v3 = icmp slt v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   cmgt v0.8h, v0.8h, #0
+;   Inst 1:   ret
+; }}
+
+function %f10(i32x4) -> b32x4 {
+block0(v0: i32x4):
+  v1 = iconst.i32 0
+  v2 = splat.i32x4 v1
+  v3 = icmp sgt v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   cmgt v0.4s, v0.4s, #0
+;   Inst 1:   ret
+; }}
+
+function %f11(i64x2) -> b64x2 {
+block0(v0: i64x2):
+  v1 = iconst.i64 0
+  v2 = splat.i64x2 v1
+  v3 = icmp sgt v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   cmlt v0.2d, v0.2d, #0
+;   Inst 1:   ret
+; }}
+
+function %f12(f32x4) -> b32x4 {
+block0(v0: f32x4):
+  v1 = f32const 0.0
+  v2 = splat.f32x4 v1
+  v3 = fcmp eq v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   fcmeq v0.4s, v0.4s, #0.0
+;   Inst 1:   ret
+; }}
+
+function %f13(f64x2) -> b64x2 {
+block0(v0: f64x2):
+  v1 = f64const 0.0
+  v2 = splat.f64x2 v1
+  v3 = fcmp eq v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   fcmeq v0.2d, v0.2d, #0.0
+;   Inst 1:   ret
+; }}
+
+function %f14(f64x2) -> b64x2 {
+block0(v0: f64x2):
+  v1 = f64const 0.0
+  v2 = splat.f64x2 v1
+  v3 = fcmp ne v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 3)
+;   Inst 0:   fcmeq v0.2d, v0.2d, #0.0
+;   Inst 1:   mvn v0.16b, v0.16b
+;   Inst 2:   ret
+; }}
+
+function %f15(f32x4) -> b32x4 {
+block0(v0: f32x4):
+  v1 = f32const 0.0
+  v2 = splat.f32x4 v1
+  v3 = fcmp ne v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 3)
+;   Inst 0:   fcmeq v0.4s, v0.4s, #0.0
+;   Inst 1:   mvn v0.16b, v0.16b
+;   Inst 2:   ret
+; }}
+
+function %f16(f32x4) -> b32x4 {
+block0(v0: f32x4):
+  v1 = f32const 0.0
+  v2 = splat.f32x4 v1
+  v3 = fcmp le v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   fcmle v0.4s, v0.4s, #0.0
+;   Inst 1:   ret
+; }}
+
+function %f17(f64x2) -> b64x2 {
+block0(v0: f64x2):
+  v1 = f64const 0.0
+  v2 = splat.f64x2 v1
+  v3 = fcmp le v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   fcmge v0.2d, v0.2d, #0.0
+;   Inst 1:   ret
+; }}
+
+function %f18(f64x2) -> b64x2 {
+block0(v0: f64x2):
+  v1 = f64const 0.0
+  v2 = splat.f64x2 v1
+  v3 = fcmp ge v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   fcmge v0.2d, v0.2d, #0.0
+;   Inst 1:   ret
+; }}
+
+function %f19(f32x4) -> b32x4 {
+block0(v0: f32x4):
+  v1 = f32const 0.0
+  v2 = splat.f32x4 v1
+  v3 = fcmp ge v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   fcmle v0.4s, v0.4s, #0.0
+;   Inst 1:   ret
+; }}
+
+function %f20(f32x4) -> b32x4 {
+block0(v0: f32x4):
+  v1 = f32const 0.0
+  v2 = splat.f32x4 v1
+  v3 = fcmp lt v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   fcmlt v0.4s, v0.4s, #0.0
+;   Inst 1:   ret
+; }}
+
+function %f21(f64x2) -> b64x2 {
+block0(v0: f64x2):
+  v1 = f64const 0.0
+  v2 = splat.f64x2 v1
+  v3 = fcmp lt v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   fcmgt v0.2d, v0.2d, #0.0
+;   Inst 1:   ret
+; }}
+
+function %f22(f64x2) -> b64x2 {
+block0(v0: f64x2):
+  v1 = f64const 0.0
+  v2 = splat.f64x2 v1
+  v3 = fcmp gt v0, v2
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   fcmgt v0.2d, v0.2d, #0.0
+;   Inst 1:   ret
+; }}
+
+function %f23(f32x4) -> b32x4 {
+block0(v0: f32x4):
+  v1 = f32const 0.0
+  v2 = splat.f32x4 v1
+  v3 = fcmp gt v2, v0
+  return v3
+}
+
+; VCode_ShowWithRRU {{
+;   Entry block: 0
+; Block 0:
+;   (original IR block: block0)
+;   (instruction range: 0 .. 2)
+;   Inst 0:   fcmlt v0.4s, v0.4s, #0.0
+;   Inst 1:   ret
+; }}
--- a/cranelift/filetests/filetests/runtests/simd_compare_zero.clif
+++ b/cranelift/filetests/filetests/runtests/simd_compare_zero.clif
@@ -0,0 +1,255 @@
+test run
+target aarch64
+
+; raw_bitcast is needed to get around issue with "bint" on aarch64
+
+function %simd_icmp_eq_i8(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i8 0
+    v3 = splat.i8x16 v1
+    v2 = icmp eq v0, v3
+    v4 = raw_bitcast.i8x16 v2
+    return v4
+}
+; run: %simd_icmp_eq_i8([-1 0 1 100 -1 0 1 100 -1 0 1 100 -1 0 1 100]) == [0 0xff 0 0 0 0xff 0 0 0 0xff 0 0 0 0xff 0 0]
+
+function %simd_icmp_ne_i16(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iconst.i16 0
+    v3 = splat.i16x8 v1
+    v2 = icmp ne v0, v3
+    v4 = raw_bitcast.i16x8 v2
+    return v4
+}
+; run: %simd_icmp_ne_i16([-1 0 1 100 -1 0 1 100]) == [0xffff 0 0xffff 0xffff 0xffff 0 0xffff 0xffff]
+
+function %simd_icmp_le_i32(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iconst.i32 0
+    v3 = splat.i32x4 v1
+    v2 = icmp sle v0, v3
+    v4 = raw_bitcast.i32x4 v2
+    return v4
+}
+; run: %simd_icmp_le_i32([-1 0 1 100]) == [0xffffffff 0xffffffff 0 0]
+
+function %simd_icmp_ge_i64(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iconst.i64 0
+    v3 = splat.i64x2 v1
+    v2 = icmp sge v0, v3
+    v4 = raw_bitcast.i64x2 v2
+    return v4
+}
+; run: %simd_icmp_ge_i64([-1 0]) == [0 0xffffffffffffffff]
+; run: %simd_icmp_ge_i64([1 100]) == [0xffffffffffffffff 0xffffffffffffffff]
+
+function %simd_icmp_lt_i8(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i8 0
+    v3 = splat.i8x16 v1
+    v2 = icmp slt v0, v3
+    v4 = raw_bitcast.i8x16 v2
+    return v4
+}
+; run: %simd_icmp_lt_i8([-1 0 1 100 -1 0 1 100 -1 0 1 100 -1 0 1 100]) == [0xff 0 0 0 0xff 0 0 0 0xff 0 0 0 0xff 0 0 0]
+
+function %simd_icmp_gt_i16(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iconst.i16 0
+    v3 = splat.i16x8 v1
+    v2 = icmp sgt v0, v3
+    v4 = raw_bitcast.i16x8 v2
+    return v4
+}
+; run: %simd_icmp_gt_i16([-1 0 1 100 -1 0 1 100]) == [0 0 0xffff 0xffff 0 0 0xffff 0xffff]
+
+function %simd_fcmp_eq_f32(f32x4) -> i32x4 {
+block0(v0: f32x4):
+    v1 = f32const 0.0
+    v3 = splat.f32x4 v1
+    v2 = fcmp eq v0, v3
+    v4 = raw_bitcast.i32x4 v2
+    return v4
+}
+; run: %simd_fcmp_eq_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0 0xffffffff 0 0]
+
+function %simd_fcmp_ne_f64(f64x2) -> i64x2 {
+block0(v0: f64x2):
+    v1 = f64const 0.0
+    v3 = splat.f64x2 v1
+    v2 = fcmp ne v0, v3
+    v4 = raw_bitcast.i64x2 v2
+    return v4
+}
+; run: %simd_fcmp_ne_f64([-0x1.0 0x0.0]) == [0xffffffffffffffff 0]
+; run: %simd_fcmp_ne_f64([0x1.0 NaN]) == [0xffffffffffffffff 0xffffffffffffffff]
+
+function %simd_fcmp_le_f32(f32x4) -> i32x4 {
+block0(v0: f32x4):
+    v1 = f32const 0.0
+    v3 = splat.f32x4 v1
+    v2 = fcmp le v0, v3
+    v4 = raw_bitcast.i32x4 v2
+    return v4
+}
+; run: %simd_fcmp_le_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0xffffffff 0 0]
+
+function %simd_fcmp_ge_f64(f64x2) -> i64x2 {
+block0(v0: f64x2):
+    v1 = f64const 0.0
+    v3 = splat.f64x2 v1
+    v2 = fcmp ge v0, v3
+    v4 = raw_bitcast.i64x2 v2
+    return v4
+}
+
+; run: %simd_fcmp_ge_f64([-0x1.0 0x0.0]) == [0 0xffffffffffffffff]
+; run: %simd_fcmp_ge_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
+
+function %simd_fcmp_lt_f32(f32x4) -> i32x4 {
+block0(v0: f32x4):
+    v1 = f32const 0.0
+    v3 = splat.f32x4 v1
+    v2 = fcmp lt v0, v3
+    v4 = raw_bitcast.i32x4 v2
+    return v4
+}
+; run: %simd_fcmp_lt_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0 0 0]
+
+function %simd_fcmp_gt_f64(f64x2) -> i64x2 {
+block0(v0: f64x2):
+    v1 = f64const 0.0
+    v3 = splat.f64x2 v1
+    v2 = fcmp gt v0, v3
+    v4 = raw_bitcast.i64x2 v2
+    return v4
+}
+
+; run: %simd_fcmp_gt_f64([-0x1.0 0x0.0]) == [0 0]
+; run: %simd_fcmp_gt_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
+
+function %simd_icmp_eq_i32(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iconst.i32 0
+    v3 = splat.i32x4 v1
+    v2 = icmp eq v3, v0
+    v4 = raw_bitcast.i32x4 v2
+    return v4
+}
+; run: %simd_icmp_eq_i32([1 0 -1 100]) == [0 0xffffffff 0 0]
+
+function %simd_icmp_ne_i64(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iconst.i64 0
+    v3 = splat.i64x2 v1
+    v2 = icmp ne v3, v0
+    v4 = raw_bitcast.i64x2 v2
+    return v4
+}
+; run: %simd_icmp_ne_i64([-1 0]) == [0xffffffffffffffff 0]
+; run: %simd_icmp_ne_i64([1 100]) == [0xffffffffffffffff 0xffffffffffffffff]
+
+function %simd_icmp_le_i8(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i8 0
+    v3 = splat.i8x16 v1
+    v2 = icmp sle v3, v0
+    v4 = raw_bitcast.i8x16 v2
+    return v4
+}
+; run: %simd_icmp_le_i8([-1 0 1 100 -1 0 1 100 -1 0 1 100 -1 0 1 100]) == [0 0xff 0xff 0xff 0 0xff 0xff 0xff 0 0xff 0xff 0xff 0 0xff 0xff 0xff]
+
+function %simd_icmp_ge_i16(i16x8) -> i16x8 {
+block0(v0: i16x8):
+    v1 = iconst.i16 0
+    v3 = splat.i16x8 v1
+    v2 = icmp sge v3, v0
+    v4 = raw_bitcast.i16x8 v2
+    return v4
+}
+; run: %simd_icmp_ge_i16([-1 0 1 100 -1 0 1 100]) == [0xffff 0xffff 0 0 0xffff 0xffff 0 0]
+
+function %simd_icmp_lt_i32(i32x4) -> i32x4 {
+block0(v0: i32x4):
+    v1 = iconst.i32 0
+    v3 = splat.i32x4 v1
+    v2 = icmp slt v3, v0
+    v4 = raw_bitcast.i32x4 v2
+    return v4
+}
+; run: %simd_icmp_lt_i32([-1 0 1 100]) == [0 0 0xffffffff 0xffffffff]
+
+function %simd_icmp_gt_i64(i64x2) -> i64x2 {
+block0(v0: i64x2):
+    v1 = iconst.i64 0
+    v3 = splat.i64x2 v1
+    v2 = icmp sgt v3, v0
+    v4 = raw_bitcast.i64x2 v2
+    return v4
+}
+; run: %simd_icmp_gt_i64([-1 0]) == [0xffffffffffffffff 0]
+; run: %simd_icmp_gt_i64([1 100]) == [0 0]
+
+function %simd_fcmp_eq_f64(f64x2) -> i64x2 {
+block0(v0: f64x2):
+    v1 = f64const 0.0
+    v3 = splat.f64x2 v1
+    v2 = fcmp eq v3, v0
+    v4 = raw_bitcast.i64x2 v2
+    return v4
+}
+; run: %simd_fcmp_eq_f64([-0x1.0 0x0.0]) == [0 0xffffffffffffffff]
+; run: %simd_fcmp_eq_f64([0x1.0 NaN]) == [0 0]
+
+function %simd_fcmp_ne_f32(f32x4) -> i32x4 {
+block0(v0: f32x4):
+    v1 = f32const 0.0
+    v3 = splat.f32x4 v1
+    v2 = fcmp ne v3, v0
+    v4 = raw_bitcast.i32x4 v2
+    return v4
+}
+; run: %simd_fcmp_ne_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0 0xffffffff 0xffffffff]
+
+function %simd_fcmp_le_f64(f64x2) -> i64x2 {
+block0(v0: f64x2):
+    v1 = f64const 0.0
+    v3 = splat.f64x2 v1
+    v2 = fcmp le v3, v0
+    v4 = raw_bitcast.i64x2 v2
+    return v4
+}
+; run: %simd_fcmp_le_f64([-0x1.0 0x0.0]) == [0 0xffffffffffffffff]
+; run: %simd_fcmp_le_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
+
+function %simd_fcmp_ge_f32(f32x4) -> i32x4 {
+block0(v0: f32x4):
+    v1 = f32const 0.0
+    v3 = splat.f32x4 v1
+    v2 = fcmp ge v3, v0
+    v4 = raw_bitcast.i32x4 v2
+    return v4
+}
+; run: %simd_fcmp_ge_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0xffffffff 0 0]
+
+function %simd_fcmp_lt_f64(f64x2) -> i64x2 {
+block0(v0: f64x2):
+    v1 = f64const 0.0
+    v3 = splat.f64x2 v1
+    v2 = fcmp lt v3, v0
+    v4 = raw_bitcast.i64x2 v2
+    return v4
+}
+; run: %simd_fcmp_lt_f64([-0x1.0 0x0.0]) == [0 0]
+; run: %simd_fcmp_lt_f64([0x1.0 NaN]) == [0xffffffffffffffff 0]
+
+function %simd_fcmp_gt_f32(f32x4) -> i32x4 {
+block0(v0: f32x4):
+    v1 = f32const 0.0
+    v3 = splat.f32x4 v1
+    v2 = fcmp gt v3, v0
+    v4 = raw_bitcast.i32x4 v2
+    return v4
+}
+; run: %simd_fcmp_gt_f32([-0x1.0 0x0.0 0x1.0 NaN]) == [0xffffffff 0 0 0]