Remove the Cranelift vselect instruction (#5918)

* Remove the Cranelift `vselect` instruction

This instruction is documented as selecting each lane based on the "truthy"
value of the corresponding condition lane, but the backends currently
implement it as follows (a short Rust sketch contrasting lane-wise and
bit-wise selection follows this list):

* x64 - uses the high bit of each lane for `f32x4` and `f64x2`, and
  otherwise uses the high bit of each byte, performing a byte-wise lane
  select regardless of the controlling type.

* AArch64 - this is the same as `bitselect`, which is a bit-wise
  selection rather than a lane-wise selection.

* s390x - this is the same as AArch64, a bit-wise selection rather than
  lane-wise.

* interpreter - the interpreter implements the documented semantics of
  selecting based on "truthy" values.
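
To make the divergence concrete, here is a minimal Rust sketch (not Cranelift
code) of the two behaviors on a hypothetical `i32x4` value modeled as
`[u32; 4]`: the lane-wise "truthy" select that `vselect` documents, and the
bit-wise select that `bitselect` (and most of the backends above) actually
perform. They agree whenever every condition lane is all-ones or all-zeros,
which is what vector comparisons produce, and diverge otherwise.

```rust
/// Documented `vselect` semantics: take the whole lane from `x` when the
/// corresponding condition lane is non-zero ("truthy"), otherwise from `y`.
fn lane_select(c: [u32; 4], x: [u32; 4], y: [u32; 4]) -> [u32; 4] {
    std::array::from_fn(|i| if c[i] != 0 { x[i] } else { y[i] })
}

/// `bitselect` semantics: take each bit independently from `x` where the
/// condition bit is 1 and from `y` where it is 0.
fn bit_select(c: [u32; 4], x: [u32; 4], y: [u32; 4]) -> [u32; 4] {
    std::array::from_fn(|i| (c[i] & x[i]) | (!c[i] & y[i]))
}

fn main() {
    let x: [u32; 4] = [0xaaaa_aaaa; 4];
    let y: [u32; 4] = [0x5555_5555; 4];

    // A "partial" condition lane (neither all-ones nor all-zeros): the two
    // selects give different answers.
    let partial = [0x0000_00ff, 0, u32::MAX, 0];
    assert_ne!(lane_select(partial, x, y), bit_select(partial, x, y));

    // All-ones / all-zeros lanes, as produced by vector comparisons: the two
    // selects agree, which is why `bitselect` can stand in for `vselect`.
    let mask = [u32::MAX, 0, u32::MAX, 0];
    assert_eq!(lane_select(mask, x, y), bit_select(mask, x, y));
}
```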

Coupled with this implementation status is the fact that the WebAssembly
SIMD translation does not use this instruction today either. The only use
of this instruction in Cranelift is the nan-canonicalization pass, and
since `bitselect` already has the desired semantics there, moving
nan-canonicalization to `bitselect` removes the last need for `vselect`.
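
For illustration, a minimal per-lane Rust sketch of the pattern the pass
relies on, assuming the usual `fcmp ne x, x` NaN test and the standard f32
quiet-NaN bit pattern (`0x7fc00000`); the real pass emits CLIF on whole
vectors rather than scalar Rust.

```rust
/// The IEEE 754 quiet-NaN bit pattern assumed by this sketch as the
/// canonical f32 NaN; the pass defines its own constant.
const CANON_QNAN_F32: u32 = 0x7fc0_0000;

/// Per-lane model of the vector pattern: `fcmp ne x, x` yields an all-ones
/// mask exactly for NaN lanes (NaN compares unequal to itself), and a
/// bit-wise select then substitutes the canonical NaN in those lanes only.
fn canonicalize_nan_lane(x: f32) -> f32 {
    let mask: u32 = if x != x { u32::MAX } else { 0 };
    let bits = x.to_bits();
    // bitselect mask, canonical_nan_bits, original_bits
    f32::from_bits((mask & CANON_QNAN_F32) | (!mask & bits))
}

fn main() {
    assert_eq!(canonicalize_nan_lane(1.5), 1.5);
    assert_eq!(canonicalize_nan_lane(f32::NAN).to_bits(), CANON_QNAN_F32);
}
```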

Given this situation, this commit removes `vselect` and all usage of it
throughout Cranelift.

Closes #5917

* Review comments

* Bring back vselect opts as bitselect opts

* Clean up vselect usage in the interpreter

* Move bitcast in nan canonicalization

* Add a comment about float optimization
Alex Crichton authored on 2023-03-07 18:42:05 -06:00, committed by GitHub
parent fc45ccc125
commit 07518dfd36
14 changed files with 163 additions and 333 deletions


@@ -5,10 +5,10 @@ target x86_64
target aarch64
target s390x
function %vselect_sgt_to_smax(i32x4, i32x4) -> i32x4 {
function %bitselect_sgt_to_smax(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp sgt v0, v1
v3 = vselect v2, v0, v1
v3 = bitselect v2, v0, v1
return v3
}
@@ -17,11 +17,11 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4
; This tests an inverted vselect, where the operands are swapped.
function %vselect_sgt_to_smax(i32x4, i32x4) -> i32x4 {
; This tests an inverted bitselect, where the operands are swapped.
function %bitselect_sgt_to_smax(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp sgt v0, v1
v3 = vselect v2, v1, v0
v3 = bitselect v2, v1, v0
return v3
}
@@ -31,10 +31,10 @@ block0(v0: i32x4, v1: i32x4):
function %vselect_sge_to_smax(i32x4, i32x4) -> i32x4 {
function %bitselect_sge_to_smax(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp sge v0, v1
v3 = vselect v2, v0, v1
v3 = bitselect v2, v0, v1
return v3
}
@@ -43,10 +43,10 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4
function %vselect_ugt_to_umax(i32x4, i32x4) -> i32x4 {
function %bitselect_ugt_to_umax(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp ugt v0, v1
v3 = vselect v2, v0, v1
v3 = bitselect v2, v0, v1
return v3
}
@@ -55,10 +55,10 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4
function %vselect_uge_to_umax(i32x4, i32x4) -> i32x4 {
function %bitselect_uge_to_umax(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp uge v0, v1
v3 = vselect v2, v0, v1
v3 = bitselect v2, v0, v1
return v3
}
@@ -68,10 +68,10 @@ block0(v0: i32x4, v1: i32x4):
function %vselect_slt_to_smin(i32x4, i32x4) -> i32x4 {
function %bitselect_slt_to_smin(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp slt v0, v1
v3 = vselect v2, v0, v1
v3 = bitselect v2, v0, v1
return v3
}
@@ -80,10 +80,10 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4
function %vselect_sle_to_smin(i32x4, i32x4) -> i32x4 {
function %bitselect_sle_to_smin(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp sle v0, v1
v3 = vselect v2, v0, v1
v3 = bitselect v2, v0, v1
return v3
}
@@ -92,10 +92,10 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4
function %vselect_ult_to_umin(i32x4, i32x4) -> i32x4 {
function %bitselect_ult_to_umin(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp ult v0, v1
v3 = vselect v2, v0, v1
v3 = bitselect v2, v0, v1
return v3
}
@@ -104,10 +104,10 @@ block0(v0: i32x4, v1: i32x4):
; check: return v4
function %vselect_ule_to_umin(i32x4, i32x4) -> i32x4 {
function %bitselect_ule_to_umin(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp ule v0, v1
v3 = vselect v2, v0, v1
v3 = bitselect v2, v0, v1
return v3
}
@@ -117,38 +117,14 @@ block0(v0: i32x4, v1: i32x4):
function %vselect_with_different_regs_does_not_optimize(i32x4, i32x4, i32x4, i32x4) -> i32x4 {
function %bitselect_with_different_regs_does_not_optimize(i32x4, i32x4, i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4, v2: i32x4, v3: i32x4):
v4 = icmp ule v0, v1
v5 = vselect v4, v2, v3
v5 = bitselect v4, v2, v3
return v5
}
; check: block0(v0: i32x4, v1: i32x4, v2: i32x4, v3: i32x4):
; check: v4 = icmp ule v0, v1
; check: v5 = vselect v4, v2, v3
; check: v5 = bitselect v4, v2, v3
; check: return v5
function %vselect_fcmp_gt_to_fmax_pseudo(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fcmp gt v0, v1
v3 = vselect v2, v0, v1
return v3
}
; check: block0(v0: f32x4, v1: f32x4):
; check: v4 = fmax_pseudo v0, v1
; check: return v4
function %vselect_fcmp_lt_to_fmin_pseudo(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4):
v2 = fcmp lt v0, v1
v3 = vselect v2, v0, v1
return v3
}
; check: block0(v0: f32x4, v1: f32x4):
; check: v4 = fmin_pseudo v0, v1
; check: return v4


@@ -173,7 +173,7 @@ block0:
function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8):
v3 = vselect v0, v1, v2
v3 = bitselect v0, v1, v2
return v3
}
@@ -187,9 +187,9 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
; bsl v0.16b, v1.16b, v2.16b
; ret
function %vselect_f32x4(i32x4, f32x4, f32x4) -> f32x4 {
block0(v0: i32x4, v1: f32x4, v2: f32x4):
v3 = vselect v0, v1, v2
function %vselect_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = bitselect v0, v1, v2
return v3
}
@@ -203,9 +203,9 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
; bsl v0.16b, v1.16b, v2.16b
; ret
function %vselect_f64x2(i64x2, f64x2, f64x2) -> f64x2 {
block0(v0: i64x2, v1: f64x2, v2: f64x2):
v3 = vselect v0, v1, v2
function %vselect_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = bitselect v0, v1, v2
return v3
}


@@ -514,67 +514,3 @@ block0(v0: i8x16, v1: i8x16, v2: i8x16):
; vsel %v24, %v25, %v26, %v24
; br %r14
function %vselect_i64x2(i64x2, i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2, v2: i64x2):
v3 = vselect.i64x2 v0, v1, v2
return v3
}
; VCode:
; block0:
; vsel %v24, %v25, %v26, %v24
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; vsel %v24, %v25, %v26, %v24
; br %r14
function %vselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4, v2: i32x4):
v3 = vselect.i32x4 v0, v1, v2
return v3
}
; VCode:
; block0:
; vsel %v24, %v25, %v26, %v24
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; vsel %v24, %v25, %v26, %v24
; br %r14
function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8):
v3 = vselect.i16x8 v0, v1, v2
return v3
}
; VCode:
; block0:
; vsel %v24, %v25, %v26, %v24
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; vsel %v24, %v25, %v26, %v24
; br %r14
function %vselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16, v2: i8x16):
v3 = vselect.i8x16 v0, v1, v2
return v3
}
; VCode:
; block0:
; vsel %v24, %v25, %v26, %v24
; br %r14
;
; Disassembled:
; block0: ; offset 0x0
; vsel %v24, %v25, %v26, %v24
; br %r14


@@ -433,7 +433,7 @@ block0(v0: f64x2, v1: f64x2):
function %i16x8_bitselect(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8):
v3 = vselect v0, v1, v2
v3 = bitselect v0, v1, v2
return v3
}
@@ -441,7 +441,9 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vpblendvb %xmm0, %xmm1, %xmm0, %xmm2
; vpand %xmm1, %xmm0, %xmm4
; vpandn %xmm0, %xmm2, %xmm6
; vpor %xmm6, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -451,14 +453,16 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
; vpand %xmm0, %xmm1, %xmm4
; vpandn %xmm2, %xmm0, %xmm6
; vpor %xmm4, %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i32x4_bitselect(i32x4, f32x4, f32x4) -> f32x4 {
block0(v0: i32x4, v1: f32x4, v2: f32x4):
v3 = vselect v0, v1, v2
function %f32x4_bitselect(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = bitselect v0, v1, v2
return v3
}
@@ -466,7 +470,9 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vblendvps %xmm0, %xmm1, %xmm0, %xmm2
; vandps %xmm1, %xmm0, %xmm4
; vandnps %xmm0, %xmm2, %xmm6
; vorps %xmm6, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -476,14 +482,16 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; vandps %xmm0, %xmm1, %xmm4
; vandnps %xmm2, %xmm0, %xmm6
; vorps %xmm4, %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %i64x2_bitselect(i64x2, f64x2, f64x2) -> f64x2 {
block0(v0: i64x2, v1: f64x2, v2: f64x2):
v3 = vselect v0, v1, v2
function %f64x2_bitselect(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = bitselect v0, v1, v2
return v3
}
@@ -491,7 +499,9 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; vblendvpd %xmm0, %xmm1, %xmm0, %xmm2
; vandpd %xmm1, %xmm0, %xmm4
; vandnpd %xmm0, %xmm2, %xmm6
; vorpd %xmm6, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -501,7 +511,9 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; vandpd %xmm0, %xmm1, %xmm4
; vandnpd %xmm2, %xmm0, %xmm6
; vorpd %xmm4, %xmm6, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq


@@ -229,7 +229,7 @@ block0(v0: i32x4, v1: i32x4):
function %vselect_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8):
v3 = vselect v0, v1, v2
v3 = bitselect v0, v1, v2
return v3
}
@@ -237,9 +237,10 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm2, %xmm4
; pblendvb %xmm4, %xmm1, %xmm4
; movdqa %xmm4, %xmm0
; movdqa %xmm1, %xmm4
; pand %xmm4, %xmm0, %xmm4
; pandn %xmm0, %xmm2, %xmm0
; por %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -249,16 +250,17 @@ block0(v0: i16x8, v1: i16x8, v2: i16x8):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm2, %xmm4
; pblendvb %xmm0, %xmm1, %xmm4
; movdqa %xmm4, %xmm0
; movdqa %xmm1, %xmm4
; pand %xmm0, %xmm4
; pandn %xmm2, %xmm0
; por %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %vselect_f32x4(i32x4, f32x4, f32x4) -> f32x4 {
block0(v0: i32x4, v1: f32x4, v2: f32x4):
v3 = vselect v0, v1, v2
function %vselect_f32x4(f32x4, f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4, v2: f32x4):
v3 = bitselect v0, v1, v2
return v3
}
@@ -266,9 +268,10 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm2, %xmm4
; blendvps %xmm4, %xmm1, %xmm4
; movdqa %xmm4, %xmm0
; movdqa %xmm1, %xmm4
; andps %xmm4, %xmm0, %xmm4
; andnps %xmm0, %xmm2, %xmm0
; orps %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -278,16 +281,17 @@ block0(v0: i32x4, v1: f32x4, v2: f32x4):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm2, %xmm4
; blendvps %xmm0, %xmm1, %xmm4
; movdqa %xmm4, %xmm0
; movdqa %xmm1, %xmm4
; andps %xmm0, %xmm4
; andnps %xmm2, %xmm0
; orps %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq
function %vselect_f64x2(i64x2, f64x2, f64x2) -> f64x2 {
block0(v0: i64x2, v1: f64x2, v2: f64x2):
v3 = vselect v0, v1, v2
function %vselect_f64x2(f64x2, f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2, v2: f64x2):
v3 = bitselect v0, v1, v2
return v3
}
@@ -295,9 +299,10 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
; pushq %rbp
; movq %rsp, %rbp
; block0:
; movdqa %xmm2, %xmm4
; blendvpd %xmm4, %xmm1, %xmm4
; movdqa %xmm4, %xmm0
; movdqa %xmm1, %xmm4
; andpd %xmm4, %xmm0, %xmm4
; andnpd %xmm0, %xmm2, %xmm0
; orpd %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
@@ -307,9 +312,10 @@ block0(v0: i64x2, v1: f64x2, v2: f64x2):
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; movdqa %xmm2, %xmm4
; blendvpd %xmm0, %xmm1, %xmm4
; movdqa %xmm4, %xmm0
; movdqa %xmm1, %xmm4
; andpd %xmm0, %xmm4
; andnpd %xmm2, %xmm0
; orpd %xmm4, %xmm0
; movq %rbp, %rsp
; popq %rbp
; retq


@@ -1,82 +0,0 @@
test interpret
test run
target s390x
target aarch64
set enable_simd
target x86_64 has_sse3 has_ssse3 has_sse41
function %vselect_i8x16() -> i8x16 {
block0:
v1 = vconst.i8x16 [0 -1 0 -1 0 -1 -1 -1 -1 -1 0 0 0 0 0 0]
v2 = vconst.i8x16 [100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115]
v3 = vconst.i8x16 [200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i8x16() == [200 101 202 103 204 105 106 107 108 109 210 211 212 213 214 215]
function %vselect_i16x8() -> i16x8 {
block0:
v1 = vconst.i16x8 [0 -1 0 -1 0 -1 -1 -1]
v2 = vconst.i16x8 [100 101 102 103 104 105 106 107]
v3 = vconst.i16x8 [200 201 202 203 204 205 206 207]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i16x8() == [200 101 202 103 204 105 106 107]
function %vselect_i32x4_const() -> i32x4 {
block0:
v1 = vconst.i32x4 [0 -1 0 -1]
v2 = vconst.i32x4 [100 101 102 103]
v3 = vconst.i32x4 [200 201 202 203]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i32x4_const() == [200 101 202 103]
function %vselect_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4, v2: i32x4):
v3 = vselect v0, v1, v2
return v3
}
; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %vselect_i32x4([-1 -1 0 0], [1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]
function %vselect_i64x2() -> i64x2 {
block0:
v1 = vconst.i64x2 [0 -1]
v2 = vconst.i64x2 [100 101]
v3 = vconst.i64x2 [200 201]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i64x2() == [200 101]
function %vselect_p_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16, v2: i8x16):
v3 = vselect v0, v1, v2
return v3
}
; run: %vselect_p_i8x16([-1 0 -1 -1 -1 0 0 0 -1 0 -1 -1 -1 0 0 0], [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32]) == [1 18 3 4 5 22 23 24 9 26 11 12 13 30 31 32]
function %vselect_p_i16x8(i16x8, i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8, v2: i16x8):
v3 = vselect v0, v1, v2
return v3
}
; run: %vselect_p_i16x8([-1 0 -1 -1 -1 0 0 0], [1 2 3 4 5 6 7 8], [17 18 19 20 21 22 23 24]) == [1 18 3 4 5 22 23 24]
function %vselect_p_i32x4(i32x4, i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4, v2: i32x4):
v3 = vselect v0, v1, v2
return v3
}
; run: %vselect_p_i32x4([-1 0 -1 -1], [1 2 3 4], [100000 200000 300000 400000]) == [1 200000 3 4]
function %vselect_p_i64x2(i64x2, i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2, v2: i64x2):
v3 = vselect v0, v1, v2
return v3
}
; run: %vselect_p_i64x2([-1 0], [1 2], [100000000000 200000000000]) == [1 200000000000]