cranelift: Align Scalar and SIMD shift semantics (#4520)

* cranelift: Reorganize test suite

Group some SIMD operations by instruction.

* cranelift: Deduplicate some shift tests

Also, add new tests exercising the mod behaviour.

* aarch64: Lower shifts with mod behaviour

* x64: Lower shifts with mod behaviour

* wasmtime: Don't mask SIMD shifts
Afonso Bordado
2022-07-27 18:54:00 +01:00
committed by GitHub
parent e121c209fc
commit 0508932174
15 changed files with 314 additions and 423 deletions
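
For context: the point of this change is that vector shifts now take their shift amount modulo the lane width, matching Cranelift's scalar shift semantics and the Wasm SIMD spec. A minimal CLIF runtest sketch of the guaranteed behaviour (illustrative only — this exact test is not part of the diff, and the function name is made up):

test run
set enable_simd
target aarch64
target s390x
target x86_64 skylake

function %ishl_mod_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
    v2 = ishl v0, v1
    return v2
}
; run: %ishl_mod_i32x4([1 2 4 8], 33) == [2 4 8 16] ; 33 mod 32 == 1

Since the backends now guarantee this masking, Wasmtime no longer has to emit an explicit mask before lowering Wasm SIMD shifts, which is what the "Don't mask SIMD shifts" change removes.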

@@ -344,9 +344,10 @@ block0(v0: i8x16):
 ; block0:
 ; movz x3, #1
-; sub w5, wzr, w3
-; dup v7.16b, w5
-; ushl v0.16b, v0.16b, v7.16b
+; and w5, w3, #7
+; sub x7, xzr, x5
+; dup v17.16b, w7
+; ushl v0.16b, v0.16b, v17.16b
 ; ret
 function %add_i128(i128, i128) -> i128 {
@@ -492,4 +493,3 @@ block0(v0: i64):
; b.vc 8 ; udf
; sdiv x0, x0, x3
; ret
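
In the first hunk above, the new code masks the shift amount with `and w5, w3, #7` — i.e. takes it modulo the 8-bit lane width — before negating and splatting it for `ushl` (AArch64 has no vector right-shift-by-register, so a right shift is lowered as a `ushl` by a negated amount). A sketch of the kind of CLIF this compiles, reconstructed from the hunk context and therefore an assumption:

test compile
target aarch64

function %ushr_const_i8x16(i8x16) -> i8x16 {
block0(v0: i8x16):
    v1 = iconst.i32 1
    v2 = ushr v0, v1
    return v2
}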

@@ -206,12 +206,13 @@ block0(v0: i32):
 ; movq %rsp, %rbp
 ; block0:
 ; load_const VCodeConstant(1), %xmm0
-; movd %edi, %xmm5
-; psllw %xmm0, %xmm5, %xmm0
-; lea const(VCodeConstant(0)), %rsi
+; andq %rdi, $7, %rdi
+; movd %edi, %xmm7
+; psllw %xmm0, %xmm7, %xmm0
+; lea const(VCodeConstant(0)), %rax
 ; shlq $4, %rdi, %rdi
-; movdqu 0(%rsi,%rdi,1), %xmm13
-; pand %xmm0, %xmm13, %xmm0
+; movdqu 0(%rax,%rdi,1), %xmm15
+; pand %xmm0, %xmm15, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -228,9 +229,14 @@ block0:
 ; movq %rsp, %rbp
 ; block0:
 ; load_const VCodeConstant(1), %xmm0
-; psrlw %xmm0, $1, %xmm0
-; movdqu const(VCodeConstant(0)), %xmm5
-; pand %xmm0, %xmm5, %xmm0
+; movl $1, %r11d
+; andq %r11, $7, %r11
+; movd %r11d, %xmm7
+; psrlw %xmm0, %xmm7, %xmm0
+; lea const(VCodeConstant(0)), %rax
+; shlq $4, %r11, %r11
+; movdqu 0(%rax,%r11,1), %xmm15
+; pand %xmm0, %xmm15, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -245,15 +251,16 @@ block0(v0: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; load_const VCodeConstant(0), %xmm9
-; movdqa %xmm9, %xmm0
-; punpcklbw %xmm0, %xmm9, %xmm0
-; punpckhbw %xmm9, %xmm9, %xmm9
+; load_const VCodeConstant(0), %xmm10
+; andq %rdi, $7, %rdi
+; movdqa %xmm10, %xmm0
+; punpcklbw %xmm0, %xmm10, %xmm0
+; punpckhbw %xmm10, %xmm10, %xmm10
 ; addl %edi, $8, %edi
-; movd %edi, %xmm11
-; psraw %xmm0, %xmm11, %xmm0
-; psraw %xmm9, %xmm11, %xmm9
-; packsswb %xmm0, %xmm9, %xmm0
+; movd %edi, %xmm13
+; psraw %xmm0, %xmm13, %xmm0
+; psraw %xmm10, %xmm13, %xmm10
+; packsswb %xmm0, %xmm10, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
@@ -267,17 +274,19 @@ block0(v0: i8x16, v1: i32):
 ; pushq %rbp
 ; movq %rsp, %rbp
 ; block0:
-; movdqa %xmm0, %xmm9
-; punpcklbw %xmm9, %xmm0, %xmm9
+; movl $3, %esi
+; andq %rsi, $7, %rsi
+; movdqa %xmm0, %xmm15
+; punpcklbw %xmm15, %xmm0, %xmm15
+; movdqa %xmm15, %xmm13
 ; punpckhbw %xmm0, %xmm0, %xmm0
-; movdqa %xmm9, %xmm12
-; psraw %xmm12, $11, %xmm12
-; movdqa %xmm12, %xmm9
-; psraw %xmm0, $11, %xmm0
-; movdqa %xmm9, %xmm1
-; packsswb %xmm1, %xmm0, %xmm1
-; movdqa %xmm1, %xmm9
-; movdqa %xmm9, %xmm0
+; movdqa %xmm0, %xmm7
+; addl %esi, $8, %esi
+; movd %esi, %xmm15
+; movdqa %xmm13, %xmm0
+; psraw %xmm0, %xmm15, %xmm0
+; psraw %xmm7, %xmm15, %xmm7
+; packsswb %xmm0, %xmm7, %xmm0
 ; movq %rbp, %rsp
 ; popq %rbp
 ; ret
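
The i8x16 `sshr` hunks above also show how x64 compensates for the missing byte-wide arithmetic shift: each byte is interleaved with itself into a 16-bit lane (`punpcklbw`/`punpckhbw`), the words are shifted with `psraw` by the (now masked) amount plus 8 (hence `addl %edi, $8, %edi`), and the lanes are narrowed back with `packsswb`. A sketch of the CLIF behind it, matching the `block0(v0: i8x16, v1: i32)` context in the last hunk:

test compile
target x86_64

function %sshr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
    v2 = sshr v0, v1
    return v2
}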

@@ -13,3 +13,33 @@ block0(v0: i32x4, v1: i32x4, v2: i32x4):
; run: %bitselect_i32x4(0x11111111111111111111111111111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x11111111111111111111111111111111
; run: %bitselect_i32x4(0x01010011000011110000000011111111, 0x11111111111111111111111111111111, 0x00000000000000000000000000000000) == 0x01010011000011110000000011111111
; run: %bitselect_i32x4(0x00000000000000001111111111111111, 0x00000000000000000000000000000000, 0x11111111111111111111111111111111) == 0x11111111111111110000000000000000
function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16, v2: i8x16):
v3 = bitselect v0, v1, v2
return v3
}
; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
function %bitselect_i8x16() -> b1 {
block0:
v0 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255] ; the selector vector
v1 = vconst.i8x16 [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] ; for each 1-bit in v0 the bit of v1 is selected
v2 = vconst.i8x16 [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127] ; for each 0-bit in v0 the bit of v2 is selected
v3 = bitselect v0, v1, v2
v4 = extractlane v3, 0
v5 = icmp_imm eq v4, 42
v6 = extractlane v3, 1
v7 = icmp_imm eq v6, 0
v8 = extractlane v3, 15
v9 = icmp_imm eq v8, 42
v10 = band v5, v7
v11 = band v10, v9
return v11
}
; run

@@ -1,216 +0,0 @@
test run
set enable_simd
target aarch64
; target s390x FIXME: s390x implements modulo semantics for shift counts
target x86_64 skylake
; TODO: once available, replace all lane extraction with `icmp + all_ones`
function %ishl_i32x4() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i32x4 [1 2 4 8]
v2 = ishl v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 2
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 16
v7 = band v4, v6
return v7
}
; run
function %ishl_too_large_i16x8() -> b1 {
block0:
v0 = iconst.i32 17 ; note that this will shift off the end of each lane
v1 = vconst.i16x8 [1 2 4 8 16 32 64 128]
v2 = ishl v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 0
v7 = band v4, v6
return v7
}
; run
function %ushr_i8x16() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ushr v1, v0
v3 = vconst.i8x16 [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %sshr_i8x16() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1]
v2 = sshr v1, v0
v3 = vconst.i8x16 [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %ishl_i8x16() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ishl v1, v0
v3 = vconst.i8x16 [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %ushr_i64x2() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i64x2 [1 2]
v2 = ushr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 1
v6 = icmp_imm eq v5, 1
v7 = band v4, v6
return v7
}
; run
function %ushr_too_large_i32x4() -> b1 {
block0:
v0 = iconst.i32 33 ; note that this will shift off the end of each lane
v1 = vconst.i32x4 [1 2 4 8]
v2 = ushr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 0
v7 = band v4, v6
return v7
}
; run
function %sshr_i16x8() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i16x8 [-1 2 4 8 -16 32 64 128]
v2 = sshr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0xffff ; because of the shifted-in sign-bit, this remains 0xffff == -1
v5 = extractlane v2, 4
v6 = icmp_imm eq v5, 0xfff8 ; -16 has been shifted to -8 == 0xfff8
v7 = band v4, v6
return v7
}
; run
function %sshr_too_large_i32x4() -> b1 {
block0:
v0 = iconst.i32 33 ; note that this will shift off the end of each lane
v1 = vconst.i32x4 [1 2 4 -8]
v2 = sshr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 0xffff_ffff ; shifting in the sign-bit repeatedly fills the result with 1s
v7 = band v4, v6
return v7
}
; run
function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0:i64x2, v1:i32):
v2 = sshr v0, v1
return v2
}
; run: %sshr_i64x2([1 -1], 0) == [1 -1]
; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
; run: %sshr_i64x2([2 -2], 1) == [1 -1]
; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]
function %bitselect_i8x16() -> b1 {
block0:
v0 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255] ; the selector vector
v1 = vconst.i8x16 [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] ; for each 1-bit in v0 the bit of v1 is selected
v2 = vconst.i8x16 [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127] ; for each 0-bit in v0 the bit of v2 is selected
v3 = bitselect v0, v1, v2
v4 = extractlane v3, 0
v5 = icmp_imm eq v4, 42
v6 = extractlane v3, 1
v7 = icmp_imm eq v6, 0
v8 = extractlane v3, 15
v9 = icmp_imm eq v8, 42
v10 = band v5, v7
v11 = band v10, v9
return v11
}
; run
function %sshr_imm_i32x4() -> b1 {
block0:
v1 = vconst.i32x4 [1 2 4 -8]
v2 = sshr_imm v1, 1
v3 = vconst.i32x4 [0 1 2 -4]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %sshr_imm_i16x8() -> b1 {
block0:
v1 = vconst.i16x8 [1 2 4 -8 0 0 0 0]
v2 = ushr_imm v1, 1
v3 = vconst.i16x8 [0 1 2 32764 0 0 0 0] ; -4 with MSB unset == 32764
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %ishl_imm_i64x2() -> b1 {
block0:
v1 = vconst.i64x2 [1 0]
v2 = ishl_imm v1, 1
v3 = vconst.i64x2 [2 0]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run

@@ -1,132 +0,0 @@
test run
target aarch64
; target s390x FIXME: s390x implements modulo semantics for shift counts
set enable_simd
target x86_64 skylake
function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16, v2: i8x16):
v3 = bitselect v0, v1, v2
return v3
}
; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
function %vselect_i32x4(i32x4, i32x4) -> i32x4 {
block0(v1: i32x4, v2: i32x4):
; `make_trampoline` still does not know how to convert boolean vector types
; so we load the value directly here.
v0 = vconst.b32x4 [true true false false]
v3 = vselect v0, v1, v2
return v3
}
; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %vselect_i8x16([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]
; shift left
function %ishl_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
v2 = ishl v0, v1
return v2
}
; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 4) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0]
function %ishl_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
v2 = ishl v0, v1
return v2
}
; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 17) == [0 0 0 0 0 0 0 0]
function %ishl_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = ishl v0, v1
return v2
}
; run: %ishl_i32x4([1 2 4 8], 1) == [2 4 8 16]
function %ishl_imm_i64x2(i64x2) -> i64x2 {
block0(v0: i64x2):
v2 = ishl_imm v0, 1
return v2
}
; run: %ishl_imm_i64x2([1 0]) == [2 0]
; shift right (logical)
function %ushr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
v2 = ushr v0, v1
return v2
}
; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 1) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
function %ushr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = ushr v0, v1
return v2
}
; run: %ushr_i32x4([1 2 4 8], 33) == [0 0 0 0]
function %ushr_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
v2 = ushr v0, v1
return v2
}
; run: %ushr_i64x2([1 2], 1) == [0 1]
; shift right (arithmetic)
function %sshr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
v2 = sshr v0, v1
return v2
}
; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 1) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
function %sshr_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
v2 = sshr v0, v1
return v2
}
; note: because of the shifted-in sign-bit, lane 0 remains -1 == 0xffff, whereas lane 4 has been shifted to -8 == 0xfff8
; run: %ushr_i16x8([-1 2 4 8 -16 32 64 128], 1) == [-1 1 2 4 -8 16 32 64]
function %sshr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = sshr v0, v1
return v2
}
; note: shifting in the sign-bit repeatedly in lane 3 fills the result with 1s (-1 == 0xffff_ffff)
; run: %ushr_i32x4([1 2 4 -8], 33) == [0 0 0 0xffff_ffff]
function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0:i64x2, v1:i32):
v2 = sshr v0, v1
return v2
}
; run: %sshr_i64x2([1 -1], 0) == [1 -1]
; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
; run: %sshr_i64x2([2 -2], 1) == [1 -1]
; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]
function %sshr_imm_i32x4(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = sshr_imm v0, 1
return v1
}
; run: %sshr_imm_i32x4([1 2 4 -8]) == [0 1 2 -4]
function %sshr_imm_i16x8(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = sshr_imm v0, 1
return v1
}
; run: %sshr_imm_i16x8([1 2 4 -8 0 0 0 0]) == [0 1 2 -4 0 0 0 0]

@@ -0,0 +1,46 @@
test run
set enable_simd
target aarch64
target s390x
target x86_64 skylake
function %ishl_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
v2 = ishl v0, v1
return v2
}
; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 4) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0]
; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 12) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0]
function %ishl_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
v2 = ishl v0, v1
return v2
}
; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 1) == [2 4 8 16 32 64 128 256]
; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 17) == [2 4 8 16 32 64 128 256]
function %ishl_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = ishl v0, v1
return v2
}
; run: %ishl_i32x4([1 2 4 8], 1) == [2 4 8 16]
; run: %ishl_i32x4([1 2 4 8], 33) == [2 4 8 16]
function %ishl_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
v2 = ishl v0, v1
return v2
}
; run: %ishl_i64x2([1 2], 1) == [2 4]
; run: %ishl_i64x2([1 2], 65) == [2 4]
function %ishl_imm_i64x2(i64x2) -> i64x2 {
block0(v0: i64x2):
v2 = ishl_imm v0, 1
return v2
}
; run: %ishl_imm_i64x2([1 0]) == [2 0]

@@ -0,0 +1,58 @@
test run
set enable_simd
target aarch64
target s390x
target x86_64 skylake
function %sshr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
v2 = sshr v0, v1
return v2
}
; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 1) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 9) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
function %sshr_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
v2 = sshr v0, v1
return v2
}
; note: because of the shifted-in sign-bit, lane 0 remains -1 == 0xffff, whereas lane 4 has been shifted to -8 == 0xfff8
; run: %sshr_i16x8([-1 2 4 8 -16 32 64 128], 1) == [-1 1 2 4 -8 16 32 64]
; run: %sshr_i16x8([-1 2 4 8 -16 32 64 128], 17) == [-1 1 2 4 -8 16 32 64]
function %sshr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = sshr v0, v1
return v2
}
; run: %sshr_i32x4([1 2 4 -8], 1) == [0 1 2 -4]
; run: %sshr_i32x4([1 2 4 -8], 33) == [0 1 2 -4]
function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0:i64x2, v1:i32):
v2 = sshr v0, v1
return v2
}
; run: %sshr_i64x2([1 -1], 0) == [1 -1]
; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
; run: %sshr_i64x2([2 -2], 1) == [1 -1]
; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]
; run: %sshr_i64x2([2 -2], 65) == [1 -1]
function %sshr_imm_i32x4(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = sshr_imm v0, 1
return v1
}
; run: %sshr_imm_i32x4([1 2 4 -8]) == [0 1 2 -4]
function %sshr_imm_i16x8(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = sshr_imm v0, 1
return v1
}
; run: %sshr_imm_i16x8([1 2 4 -8 0 0 0 0]) == [0 1 2 -4 0 0 0 0]

@@ -0,0 +1,52 @@
test run
set enable_simd
target aarch64
target s390x
target x86_64 skylake
function %ushr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
v2 = ushr v0, v1
return v2
}
; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 1) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 9) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
function %ushr_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
v2 = ushr v0, v1
return v2
}
; run: %ushr_i16x8([0 1 2 3 4 5 6 7], 1) == [0 0 1 1 2 2 3 3]
; run: %ushr_i16x8([0 1 2 3 4 5 6 7], 17) == [0 0 1 1 2 2 3 3]
; run: %ushr_i16x8([1 2 4 -8 0 0 0 0], 1) == [0 1 2 32764 0 0 0 0]
function %ushr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = ushr v0, v1
return v2
}
; run: %ushr_i32x4([1 2 4 8], 1) == [0 1 2 4]
; run: %ushr_i32x4([1 2 4 8], 33) == [0 1 2 4]
function %ushr_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
v2 = ushr v0, v1
return v2
}
; run: %ushr_i64x2([1 2], 1) == [0 1]
; run: %ushr_i64x2([1 2], 65) == [0 1]
function %ushr_imm_i16x8() -> b1 {
block0:
v1 = vconst.i16x8 [1 2 4 -8 0 0 0 0]
v2 = ushr_imm v1, 1
v3 = vconst.i16x8 [0 1 2 32764 0 0 0 0] ; -4 with MSB unset == 32764
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run

@@ -72,3 +72,15 @@ block0(v0: b64x2, v1: i64x2, v2: i64x2):
return v3
}
; run: %vselect_p_i64x2([true false], [1 2], [100000000000 200000000000]) == [1 200000000000]
function %vselect_i32x4(i32x4, i32x4) -> i32x4 {
block0(v1: i32x4, v2: i32x4):
; `make_trampoline` still does not know how to convert boolean vector types
; so we load the value directly here.
v0 = vconst.b32x4 [true true false false]
v3 = vselect v0, v1, v2
return v3
}
; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %vselect_i32x4([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]