cranelift: Merge all run tests into runtests dir

With this change, tests are now reused across multiple architectures.

Duplicate tests were merged into the same file where possible.
Some legacy x86 tests were left in separate files due to incompatibilities with the rest of the test suite.
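A merged test now lists every target it runs on in a single header, e.g. (as in the files below):

  test run
  target aarch64
  target s390x
  target x86_64 machinst
  target x86_64 legacy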
Afonso Bordado
2021-06-03 20:01:38 +01:00
parent e25bf362ab
commit 214755c6a0
43 changed files with 618 additions and 1115 deletions

View File

@@ -0,0 +1,17 @@
test run
target aarch64
target arm
target s390x
; target x86_64 machinst TODO: Not yet implemented on x86_64
target x86_64 legacy
function %bnot_band() -> b1 {
block0:
v1 = bconst.b1 false
v2 = bconst.b1 true
v3 = bnot v1
v4 = band v3, v2
return v4
}
; run

View File

@@ -0,0 +1,38 @@
test run
target aarch64
target arm
target s390x
target x86_64 machinst
target x86_64 legacy
function u0:0() -> b1 {
block0:
v0 = iconst.i8 0
brz v0, block1
jump block2
block1:
v1 = bconst.b1 true
return v1
block2:
v2 = bconst.b1 false
return v2
}
; run
function u0:1() -> b1 {
block0:
v0 = iconst.i8 0
brnz v0, block1
jump block2
block1:
v1 = bconst.b1 false
return v1
block2:
v2 = bconst.b1 true
return v2
}
; run

View File

@@ -0,0 +1,155 @@
test run
target aarch64
target arm
target s390x
target x86_64 machinst
target x86_64 legacy
function %i8_iconst_0() -> i8 {
block0:
v1 = iconst.i8 0
return v1
}
; run: %i8_iconst_0() == 0
function %i8_iconst_1() -> i8 {
block0:
v1 = iconst.i8 1
return v1
}
; run: %i8_iconst_1() == 1
function %i8_iconst_neg_one() -> i8 {
block0:
v1 = iconst.i8 -1
return v1
}
; run: %i8_iconst_neg_one() == -1
function %i16_iconst_0() -> i16 {
block0:
v1 = iconst.i16 0
return v1
}
; run: %i16_iconst_0() == 0
function %i16_iconst_1() -> i16 {
block0:
v1 = iconst.i16 1
return v1
}
; run: %i16_iconst_1() == 1
function %i16_iconst_neg_one() -> i16 {
block0:
v1 = iconst.i16 -1
return v1
}
; run: %i16_iconst_neg_one() == -1
function %i32_iconst_0() -> i32 {
block0:
v1 = iconst.i32 0
return v1
}
; run: %i32_iconst_0() == 0
function %i32_iconst_1() -> i32 {
block0:
v1 = iconst.i32 1
return v1
}
; run: %i32_iconst_1() == 1
function %i32_iconst_neg_one() -> i32 {
block0:
v1 = iconst.i32 -1
return v1
}
; run: %i32_iconst_neg_one() == -1
function %i64_iconst_0() -> i64 {
block0:
v1 = iconst.i64 0
return v1
}
; run: %i64_iconst_0() == 0
function %i64_iconst_1() -> i64 {
block0:
v1 = iconst.i64 1
return v1
}
; run: %i64_iconst_1() == 1
function %i64_iconst_neg_one() -> i64 {
block0:
v1 = iconst.i64 -1
return v1
}
; run: %i64_iconst_neg_one() == -1
function %b8_bconst_false() -> b8 {
block0:
v1 = bconst.b8 false
return v1
}
; run: %b8_bconst_false() == false
function %b8_bconst_true() -> b8 {
block0:
v1 = bconst.b8 true
return v1
}
; run: %b8_bconst_true() == true
function %b16_bconst_false() -> b16 {
block0:
v1 = bconst.b16 false
return v1
}
; run: %b16_bconst_false() == false
function %b16_bconst_true() -> b16 {
block0:
v1 = bconst.b16 true
return v1
}
; run: %b16_bconst_true() == true
function %b32_bconst_false() -> b32 {
block0:
v1 = bconst.b32 false
return v1
}
; run: %b32_bconst_false() == false
function %b32_bconst_true() -> b32 {
block0:
v1 = bconst.b32 true
return v1
}
; run: %b32_bconst_true() == true
function %b64_bconst_false() -> b64 {
block0:
v1 = bconst.b64 false
return v1
}
; run: %b64_bconst_false() == false
; this verifies that returning b64 immediates does not result in a segmentation fault, see https://github.com/bytecodealliance/cranelift/issues/911
function %b64_bconst_true() -> b64 {
block0:
v1 = bconst.b64 true
return v1
}
; run: %b64_bconst_true() == true

View File

@@ -0,0 +1,35 @@
test run
target aarch64
target arm
target s390x
set avoid_div_traps=false
target x86_64 machinst
function %i8(i8, i8) -> i8 {
block0(v0: i8, v1: i8):
v2 = srem.i8 v0, v1
return v2
}
; run: %i8(0x80, 0xff) == 0
; run: %i8(0x2, 0x7) == 0x2
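; note: 0x80 % 0xff is i8::MIN % -1; the quotient overflows i8, but the remainder is defined and equal to 0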
function %i16(i16, i16) -> i16 {
block0(v0: i16, v1: i16):
v2 = srem.i16 v0, v1
return v2
}
; run: %i16(0x8000, 0xffff) == 0
function %i32(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = srem.i32 v0, v1
return v2
}
; run: %i32(0x80000000, 0xffffffff) == 0
function %i64(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = srem.i64 v0, v1
return v2
}
; run: %i64(0x8000000000000000, 0xffffffffffffffff) == 0

View File

@@ -0,0 +1,30 @@
test run
target aarch64
target arm
target s390x
; target x86_64 machinst TODO: Not yet implemented on x86_64
target i686 legacy
function %uextend() -> b1 {
block0:
v0 = iconst.i32 0xffff_ee00
v1 = uextend.i64 v0
v2, v3 = isplit v1
v4 = icmp_imm eq v2, 0xffff_ee00
v5 = icmp_imm eq v3, 0
v6 = band v4, v5
return v6
}
; run
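; sextend (below) fills the high half with copies of the sign bit of 0xffff_ee00, i.e. 0xffff_ffff, whereas uextend above left it zero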
function %sextend() -> b1 {
block0:
v0 = iconst.i32 0xffff_ee00
v1 = sextend.i64 v0
v2, v3 = isplit v1
v4 = icmp_imm eq v2, 0xffff_ee00
v5 = icmp_imm eq v3, 0xffff_ffff
v6 = band v4, v5
return v6
}
; run

View File

@@ -0,0 +1,20 @@
test run
target x86_64 legacy haswell
function %test_imul_i128() -> b1 {
block0:
v11 = iconst.i64 0xf2347ac4503f1e24
v12 = iconst.i64 0x0098fe985354ab06
v1 = iconcat v11, v12
v21 = iconst.i64 0xf606ba453589ef89
v22 = iconst.i64 0x042e1f3054ca7432
v2 = iconcat v21, v22
v31 = iconst.i64 0xbe2044b2742ebd44
v32 = iconst.i64 0xa363ce3b6849f307
v3 = iconcat v31, v32
v4 = imul v1, v2
v5 = icmp eq v3, v4
return v5
}
; run

View File

@@ -0,0 +1,206 @@
test run
; target aarch64 TODO: Not yet implemented on aarch64
; target s390x TODO: Not yet implemented on s390x
target x86_64 machinst
; TODO: Cleanup these tests when we have native support for i128 immediates in CLIF's parser
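; Each i128 is passed and returned as two i64 halves, low half first (v = iconcat lo, hi; lo, hi = isplit v),
; so the run arguments and expected results below read as [lo, hi] pairs.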
function %add_i128(i64, i64, i64, i64) -> i64, i64 {
block0(v0: i64,v1: i64,v2: i64,v3: i64):
v4 = iconcat v0, v1
v5 = iconcat v2, v3
v6 = iadd v4, v5
v7, v8 = isplit v6
return v7, v8
}
; run: %add_i128(0, 0, 0, 0) == [0, 0]
; run: %add_i128(0, -1, -1, 0) == [-1, -1]
; run: %add_i128(1, 0, 0, 0) == [1, 0]
; run: %add_i128(1, 0, 1, 0) == [2, 0]
; run: %add_i128(1, 0, -1, -1) == [0, 0]
; run: %add_i128(-1, 0, 1, 0) == [0, 1]
; run: %add_i128(0x01234567_89ABCDEF, 0x01234567_89ABCDEF, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210) == [-1, -1]
; run: %add_i128(0x06060606_06060606, 0xA00A00A0_0A00A00A, 0x30303030_30303030, 0x0BB0BB0B_B0BB0BB0) == [0x36363636_36363636, 0xABBABBAB_BABBABBA]
; run: %add_i128(0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE, 0x1DCB1111_1DCB1111, 0x1DCB1111_1DCB1111) == [0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF]
function %sub_i128(i64, i64, i64, i64) -> i64, i64 {
block0(v0: i64,v1: i64,v2: i64,v3: i64):
v4 = iconcat v0, v1
v5 = iconcat v2, v3
v6 = isub v4, v5
v7, v8 = isplit v6
return v7, v8
}
; run: %sub_i128(0, 0, 0, 0) == [0, 0]
; run: %sub_i128(1, 0, 1, 0) == [0, 0]
; run: %sub_i128(1, 0, 0, 0) == [1, 0]
; run: %sub_i128(0, 0, 1, 0) == [-1, -1]
; run: %sub_i128(0, 0, -1, -1) == [1, 0]
; run: %sub_i128(-1, -1, 0xFEDCBA98_76543210, 0xFEDCBA98_76543210) == [0x01234567_89ABCDEF, 0x01234567_89ABCDEF]
; run: %sub_i128(0x36363636_36363636, 0xABBABBAB_BABBABBA, 0x30303030_30303030, 0x0BB0BB0B_B0BB0BB0) == [0x06060606_06060606, 0xA00A00A0_0A00A00A]
; run: %sub_i128(0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF, 0x1DCB1111_1DCB1111, 0x1DCB1111_1DCB1111) == [0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE]
function %mul_i128(i64, i64, i64, i64) -> i64, i64 {
block0(v0: i64,v1: i64,v2: i64,v3: i64):
v4 = iconcat v0, v1
v5 = iconcat v2, v3
v6 = imul v4, v5
v7, v8 = isplit v6
return v7, v8
}
; run: %mul_i128(0, 0, 0, 0) == [0, 0]
; run: %mul_i128(1, 0, 1, 0) == [1, 0]
; run: %mul_i128(1, 0, 0, 0) == [0, 0]
; run: %mul_i128(0, 0, 1, 0) == [0, 0]
; run: %mul_i128(2, 0, 1, 0) == [2, 0]
; run: %mul_i128(2, 0, 2, 0) == [4, 0]
; run: %mul_i128(1, 0, -1, -1) == [-1, -1]
; run: %mul_i128(2, 0, -1, -1) == [-2, -1]
; run: %mul_i128(0x01010101_01010101, 0x01010101_01010101, 13, 0) == [0x0D0D0D0D_0D0D0D0D, 0x0D0D0D0D_0D0D0D0D]
; run: %mul_i128(13, 0, 0x01010101_01010101, 0x01010101_01010101) == [0x0D0D0D0D_0D0D0D0D, 0x0D0D0D0D_0D0D0D0D]
; run: %mul_i128(0x00000000_01234567, 0x89ABCDEF_00000000, 0x00000000_FEDCBA98, 0x76543210_00000000) == [0x0121FA00_23E20B28, 0xE2946058_00000000]
; run: %mul_i128(0xC0FFEEEE_C0FFEEEE, 0xC0FFEEEE_C0FFEEEE, 0xDECAFFFF_DECAFFFF, 0xDECAFFFF_DECAFFFF) == [0xDB6B1E48_19BA1112, 0x5ECD38B5_9D1C2B7E]
function %ishl_i128_i8(i64, i64, i8) -> i64, i64 {
block0(v0: i64, v1: i64, v2: i8):
v3 = iconcat v0, v1
v4 = ishl.i128 v3, v2
v5, v6 = isplit v4
return v5, v6
}
; run: %ishl_i128_i8(0x01010101_01010101, 0x01010101_01010101, 2) == [0x04040404_04040404, 0x04040404_04040404]
; run: %ishl_i128_i8(0x01010101_01010101, 0x01010101_01010101, 9) == [0x02020202_02020200, 0x02020202_02020202]
; run: %ishl_i128_i8(0x01010101_01010101, 0xffffffff_ffffffff, 66) == [0x00000000_00000000, 0x04040404_04040404]
; run: %ishl_i128_i8(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
; run: %ishl_i128_i8(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]
; run: %ishl_i128_i8(0x00000000_00000001, 0x00000000_00000000, 0) == [0x00000000_00000001, 0x00000000_00000000]
; run: %ishl_i128_i8(0x00000000_00000000, 0x00000000_00000001, 0) == [0x00000000_00000000, 0x00000000_00000001]
; run: %ishl_i128_i8(0x12340000_00000000, 0x56780000_00000000, 0) == [0x12340000_00000000, 0x56780000_00000000]
; run: %ishl_i128_i8(0x12340000_00000000, 0x56780000_00000000, 64) == [0x00000000_00000000, 0x12340000_00000000]
; run: %ishl_i128_i8(0x12340000_00000000, 0x56780000_00000000, 32) == [0x00000000_00000000, 0x00000000_12340000]
; run: %ishl_i128_i8(0x01010101_01010101, 0x01010101_01010101, 129) == [0x02020202_02020202, 0x02020202_02020202]
; run: %ishl_i128_i8(0x01010101_01010101, 0x01010101_01010101, 130) == [0x04040404_04040404, 0x04040404_04040404]
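; the shift amount is taken modulo the 128-bit type width, which is why shifting by 128 is the
; identity and shifting by 129 or 130 behaves like shifting by 1 or 2 above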
function %ishl_i128_i128(i64, i64, i8) -> i64, i64 {
block0(v0: i64, v1: i64, v2: i8):
v3 = iconcat v0, v1
v4 = uextend.i64 v2
v5 = iconcat v4, v4
v6 = ishl.i128 v3, v5
v7, v8 = isplit v6
return v7, v8
}
; run: %ishl_i128_i128(0x01010101_01010101, 0x01010101_01010101, 2) == [0x04040404_04040404, 0x04040404_04040404]
; run: %ishl_i128_i128(0x01010101_01010101, 0x01010101_01010101, 9) == [0x02020202_02020200, 0x02020202_02020202]
; run: %ishl_i128_i128(0x01010101_01010101, 0xffffffff_ffffffff, 66) == [0x00000000_00000000, 0x04040404_04040404]
; run: %ishl_i128_i128(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
; run: %ishl_i128_i128(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]
; run: %ishl_i128_i128(0x00000000_00000001, 0x00000000_00000000, 0) == [0x00000000_00000001, 0x00000000_00000000]
; run: %ishl_i128_i128(0x00000000_00000000, 0x00000000_00000001, 0) == [0x00000000_00000000, 0x00000000_00000001]
; run: %ishl_i128_i128(0x12340000_00000000, 0x56780000_00000000, 0) == [0x12340000_00000000, 0x56780000_00000000]
; run: %ishl_i128_i128(0x12340000_00000000, 0x56780000_00000000, 64) == [0x00000000_00000000, 0x12340000_00000000]
; run: %ishl_i128_i128(0x12340000_00000000, 0x56780000_00000000, 32) == [0x00000000_00000000, 0x00000000_12340000]
; run: %ishl_i128_i128(0x01010101_01010101, 0x01010101_01010101, 129) == [0x02020202_02020202, 0x02020202_02020202]
; run: %ishl_i128_i128(0x01010101_01010101, 0x01010101_01010101, 130) == [0x04040404_04040404, 0x04040404_04040404]
function %ushr_i128_i8(i64, i64, i8) -> i64, i64 {
block0(v0: i64, v1: i64, v2: i8):
v3 = iconcat v0, v1
v4 = ushr.i128 v3, v2
v5, v6 = isplit v4
return v5, v6
}
; run: %ushr_i128_i8(0x01010101_01010101, 0x01010101_01010101, 2) == [0x40404040_40404040, 0x00404040_40404040]
; run: %ushr_i128_i8(0x01010101_01010101, 0x01010101_01010101, 66) == [0x00404040_40404040, 0x00000000_00000000]
; run: %ushr_i128_i8(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
; run: %ushr_i128_i8(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]
; run: %ushr_i128_i8(0x00000000_00000001, 0x00000000_00000000, 0) == [0x00000000_00000001, 0x00000000_00000000]
; run: %ushr_i128_i8(0x00000000_00000000, 0x00000000_00000001, 0) == [0x00000000_00000000, 0x00000000_00000001]
; run: %ushr_i128_i8(0x12340000_00000000, 0x56780000_00000000, 0) == [0x12340000_00000000, 0x56780000_00000000]
; run: %ushr_i128_i8(0x12340000_00000000, 0x56780000_00000000, 64) == [0x56780000_00000000, 0x00000000_00000000]
; run: %ushr_i128_i8(0x12340000_00000000, 0x56780000_00000000, 32) == [0x00000000_12340000, 0x00000000_56780000]
; run: %ushr_i128_i8(0x01010101_01010101, 0x01010101_01010101, 129) == [0x80808080_80808080, 0x00808080_80808080]
; run: %ushr_i128_i8(0x01010101_01010101, 0x01010101_01010101, 130) == [0x40404040_40404040, 0x00404040_40404040]
function %ushr_i128_i128(i64, i64, i8) -> i64, i64 {
block0(v0: i64, v1: i64, v2: i8):
v3 = iconcat v0, v1
v4 = uextend.i64 v2
v5 = iconcat v4, v4
v6 = ushr.i128 v3, v5
v7, v8 = isplit v6
return v7, v8
}
; run: %ushr_i128_i128(0x01010101_01010101, 0x01010101_01010101, 2) == [0x40404040_40404040, 0x00404040_40404040]
; run: %ushr_i128_i128(0x01010101_01010101, 0x01010101_01010101, 66) == [0x00404040_40404040, 0x00000000_00000000]
; run: %ushr_i128_i128(0x01010101_01010101, 0x01010101_01010101, 0) == [0x01010101_01010101, 0x01010101_01010101]
; run: %ushr_i128_i128(0x01010101_01010101, 0x01010101_01010101, 128) == [0x01010101_01010101, 0x01010101_01010101]
; run: %ushr_i128_i128(0x00000000_00000001, 0x00000000_00000000, 0) == [0x00000000_00000001, 0x00000000_00000000]
; run: %ushr_i128_i128(0x00000000_00000000, 0x00000000_00000001, 0) == [0x00000000_00000000, 0x00000000_00000001]
; run: %ushr_i128_i128(0x12340000_00000000, 0x56780000_00000000, 0) == [0x12340000_00000000, 0x56780000_00000000]
; run: %ushr_i128_i128(0x12340000_00000000, 0x56780000_00000000, 64) == [0x56780000_00000000, 0x00000000_00000000]
; run: %ushr_i128_i128(0x12340000_00000000, 0x56780000_00000000, 32) == [0x00000000_12340000, 0x00000000_56780000]
; run: %ushr_i128_i128(0x01010101_01010101, 0x01010101_01010101, 129) == [0x80808080_80808080, 0x00808080_80808080]
; run: %ushr_i128_i128(0x01010101_01010101, 0x01010101_01010101, 130) == [0x40404040_40404040, 0x00404040_40404040]
function %sshr_i128_i8(i64, i64, i8) -> i64, i64 {
block0(v0: i64, v1: i64, v2: i8):
v3 = iconcat v0, v1
v4 = sshr.i128 v3, v2
v5, v6 = isplit v4
return v5, v6
}
; run: %sshr_i128_i8(0x01010101_01010101, 0x81010101_01010101, 2) == [0x40404040_40404040, 0xe0404040_40404040]
; run: %sshr_i128_i8(0x00000000_00000000, 0xffffffff_ffffffff, 32) == [0xffffffff_00000000, 0xffffffff_ffffffff]
; run: %sshr_i128_i8(0x80000000_00000000, 0xffffffff_00000000, 32) == [0x00000000_80000000, 0xffffffff_ffffffff]
; run: %sshr_i128_i8(0x12345678_9abcdef0, 0x80101010_10101010, 66) == [0xe0040404_04040404, 0xffffffff_ffffffff]
; run: %sshr_i128_i8(0x00000000_00000000, 0x00000000_00000000, 64) == [0x00000000_00000000, 0x00000000_00000000]
; run: %sshr_i128_i8(0x12345678_9abcdef0, 0x80101010_10101010, 0) == [0x12345678_9abcdef0, 0x80101010_10101010]
; run: %sshr_i128_i8(0x12345678_9abcdef0, 0x80101010_10101010, 128) == [0x12345678_9abcdef0, 0x80101010_10101010]
; run: %sshr_i128_i8(0x01010101_01010101, 0x81010101_01010101, 129) == [0x80808080_80808080, 0xc0808080_80808080]
; run: %sshr_i128_i8(0x01010101_01010101, 0x81010101_01010101, 130) == [0x40404040_40404040, 0xe0404040_40404040]
function %sshr_i128_i128(i64, i64, i8) -> i64, i64 {
block0(v0: i64, v1: i64, v2: i8):
v3 = iconcat v0, v1
v4 = uextend.i64 v2
v5 = iconcat v4, v4
v6 = sshr.i128 v3, v5
v7, v8 = isplit v6
return v7, v8
}
; run: %sshr_i128_i128(0x01010101_01010101, 0x81010101_01010101, 2) == [0x40404040_40404040, 0xe0404040_40404040]
; run: %sshr_i128_i128(0x00000000_00000000, 0xffffffff_ffffffff, 32) == [0xffffffff_00000000, 0xffffffff_ffffffff]
; run: %sshr_i128_i128(0x80000000_00000000, 0xffffffff_00000000, 32) == [0x00000000_80000000, 0xffffffff_ffffffff]
; run: %sshr_i128_i128(0x12345678_9abcdef0, 0x80101010_10101010, 66) == [0xe0040404_04040404, 0xffffffff_ffffffff]
; run: %sshr_i128_i128(0x00000000_00000000, 0x00000000_00000000, 64) == [0x00000000_00000000, 0x00000000_00000000]
; run: %sshr_i128_i128(0x12345678_9abcdef0, 0x80101010_10101010, 0) == [0x12345678_9abcdef0, 0x80101010_10101010]
; run: %sshr_i128_i128(0x12345678_9abcdef0, 0x80101010_10101010, 128) == [0x12345678_9abcdef0, 0x80101010_10101010]
; run: %sshr_i128_i128(0x01010101_01010101, 0x81010101_01010101, 129) == [0x80808080_80808080, 0xc0808080_80808080]
; run: %sshr_i128_i128(0x01010101_01010101, 0x81010101_01010101, 130) == [0x40404040_40404040, 0xe0404040_40404040]

View File

@@ -0,0 +1,27 @@
test run
; target s390x TODO: Not yet implemented on s390x
target x86_64 machinst
function %ctz(i64, i64) -> i8 {
block0(v0: i64, v1: i64):
v2 = iconcat v0, v1
v3 = ctz.i128 v2
v4 = ireduce.i8 v3
return v4
}
; run: %ctz(0x00000000_00000000, 0x00000001_00000000) == 96
; run: %ctz(0x00000000_00010000, 0x00000001_00000000) == 16
; run: %ctz(0x00000000_00010000, 0x00000000_00000000) == 16
; run: %ctz(0x00000000_00000000, 0x00000000_00000000) == 128
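; e.g. with lo=0 and hi=0x00000001_00000000 the lowest set bit is bit 32 of the high half,
; i.e. bit 96 of the i128, so ctz returns 96; an all-zero input yields the full width, 128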
function %clz(i64, i64) -> i8 {
block0(v0: i64, v1: i64):
v2 = iconcat v0, v1
v3 = clz.i128 v2
v4 = ireduce.i8 v3
return v4
}
; run: %clz(0x00000000_00000000, 0x00000001_00000000) == 31
; run: %clz(0x00000000_00010000, 0x00000001_00000000) == 31
; run: %clz(0x00000000_00010000, 0x00000000_00000000) == 111
; run: %clz(0x00000000_00000000, 0x00000000_00000000) == 128
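; e.g. the highest set bit of hi=0x00000001_00000000 is bit 96 of the i128, so clz returns 127 - 96 = 31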

View File

@@ -0,0 +1,47 @@
test run
target x86_64 machinst
target x86_64 legacy
function %reverse_bits_zero() -> b1 {
block0:
v0 = iconst.i64 0
v1 = iconcat v0, v0
v2 = bitrev.i128 v1
v3 = icmp eq v2, v1
return v3
}
; run
function %reverse_bits_one() -> b1 {
block0:
v0 = iconst.i64 0
v1 = iconst.i64 1
v2 = iconcat v0, v1
v3 = bitrev.i128 v2
v4 = iconst.i64 0x8000_0000_0000_0000
v5 = iconst.i64 0
v6 = iconcat v4, v5
v7 = icmp eq v3, v6
return v7
}
; run
function %reverse_bits() -> b1 {
block0:
v0 = iconst.i64 0x06AD_8667_69EC_41BA
v1 = iconst.i64 0x6C83_D81A_6E28_83AB
v2 = iconcat v0, v1
v3 = bitrev.i128 v2
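; bit-reversing 128 bits swaps the two halves and bit-reverses each, so the expected
; low half v4 is bitrev64(v1) and the expected high half v5 is bitrev64(v0):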
v4 = iconst.i64 0xD5C11476581BC136
v5 = iconst.i64 0x5D823796E661B560
v6 = iconcat v4, v5
v7 = icmp eq v3, v6
return v7
}
; run

View File

@@ -0,0 +1,42 @@
test run
; target aarch64 TODO: Not yet implemented on aarch64
; target s390x TODO: Not yet implemented on s390x
target x86_64 machinst
target x86_64 legacy
function %br_false() -> b1 {
block0:
v10 = iconst.i64 0x42
v11 = iconst.i64 0x00
v0 = iconcat v10, v11
brz v0, block2
jump block1
block1:
v1 = bconst.b1 true
return v1
block2:
v2 = bconst.b1 false
return v2
}
; run
function %br_true() -> b1 {
block0:
v10 = iconst.i64 0x00
v11 = iconst.i64 0x00
v0 = iconcat v10, v11
brz v0, block2
jump block1
block1:
v1 = bconst.b1 false
return v1
block2:
v2 = bconst.b1 true
return v2
}
; run

View File

@@ -0,0 +1,12 @@
test run
target aarch64
; target s390x TODO: Not yet implemented on s390x
target x86_64 machinst
function %i128_const_0() -> i64, i64 {
block0:
v1 = iconst.i128 0
v2, v3 = isplit v1
return v2, v3
}
; run: %i128_const_0() == [0, 0]

View File

@@ -0,0 +1,29 @@
test run
; target aarch64 TODO: Not yet implemented on aarch64
; target s390x TODO: Not yet implemented on s390x
target x86_64 machinst
target x86_64 legacy
function %i128_uextend() -> b1 {
block0:
v0 = iconst.i64 0xffff_ffff_eeee_0000
v1 = uextend.i128 v0
v2, v3 = isplit v1
v4 = icmp_imm eq v2, 0xffff_ffff_eeee_0000
v5 = icmp_imm eq v3, 0
v6 = band v4, v5
return v6
}
; run
function %i128_sextend() -> b1 {
block0:
v0 = iconst.i64 0xffff_ffff_eeee_0000
v1 = sextend.i128 v0
v2, v3 = isplit v1
v4 = icmp_imm eq v2, 0xffff_ffff_eeee_0000
v5 = icmp_imm eq v3, 0xffff_ffff_ffff_ffff
v6 = band v4, v5
return v6
}
; run

View File

@@ -0,0 +1,94 @@
test run
target x86_64 machinst
function %test_icmp_eq_i128() -> b1 {
block0:
v11 = iconst.i64 0x0
v12 = iconst.i64 0x0
v1 = iconcat v11, v12
v21 = iconst.i64 0x0
v22 = iconst.i64 0x0
v2 = iconcat v21, v22
v10 = icmp.i128 eq v1, v2
return v10
}
; run
function %test_icmp_imm_eq_i128() -> b1 {
block0:
v11 = iconst.i64 0x0
v12 = iconst.i64 0x0
v1 = iconcat v11, v12
v10 = icmp_imm.i128 eq v1, 0x0
return v10
}
; run
function %test_icmp_ne_i128() -> b1 {
block0:
v11 = iconst.i64 0x0
v12 = iconst.i64 0x0
v1 = iconcat v11, v12
v21 = iconst.i64 0x0
v22 = iconst.i64 0x1
v2 = iconcat v21, v22
v10 = icmp.i128 ne v1, v2
return v10
}
; run
function %test_icmp_imm_ne_i128() -> b1 {
block0:
v11 = iconst.i64 0x0
v12 = iconst.i64 0x0
v1 = iconcat v11, v12
v10 = icmp_imm.i128 ne v1, 0x1
return v10
}
; run
function %test_icmp_nz_eq_i128() -> b1 {
block0:
v11 = iconst.i64 0x1
v12 = iconst.i64 0x1
v1 = iconcat v11, v12
v21 = iconst.i64 0x1
v22 = iconst.i64 0x1
v2 = iconcat v21, v22
v10 = icmp.i128 eq v1, v2
return v10
}
; run
function %test_icmp_nz_gt_i128() -> b1 {
block0:
v11 = iconst.i64 0x1
v12 = iconst.i64 0x1
v1 = iconcat v11, v12
v21 = iconst.i64 0x1
v22 = iconst.i64 0x2
v2 = iconcat v21, v22
v10 = icmp.i128 ugt v2, v1
return v10
}
; run
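; the ugt case above is decided by the high halves alone: v2's high half (2) exceeds v1's (1)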
function %test_icmp_nz_ge_i128() -> b1 {
block0:
v11 = iconst.i64 0x1
v12 = iconst.i64 0x1
v1 = iconcat v11, v12
v21 = iconst.i64 0x1
v22 = iconst.i64 0x1
v2 = iconcat v21, v22
v10 = icmp.i128 uge v1, v2
return v10
}
; run

View File

@@ -0,0 +1,60 @@
test run
; target aarch64 TODO: Not yet implemented on aarch64
; target s390x TODO: Not yet implemented on s390x
target x86_64 machinst
function %rotl(i64, i64, i8) -> i64, i64 {
block0(v0: i64, v1: i64, v2: i8):
v3 = iconcat v0, v1
v4 = rotl.i128 v3, v2
v5, v6 = isplit v4
return v5, v6
}
; run: %rotl(0x01010101_01010101, 0x01010101_01010101, 9) == [0x02020202_02020202, 0x02020202_02020202]
; run: %rotl(0x01010101_01010101, 0x01010101_01010101, 73) == [0x02020202_02020202, 0x02020202_02020202]
; run: %rotl(0x01010101_01010101, 0x02020202_02020202, 0) == [0x01010101_01010101, 0x02020202_02020202]
; run: %rotl(0x01010101_01010101, 0x03030303_03030303, 128) == [0x01010101_01010101, 0x03030303_03030303]
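; the 0x01-per-byte pattern has period 8 and 128 is a multiple of 8, so rotating by 9 or 73
; (both = 1 mod 8) acts like rotating by 1: each byte doubles to 0x02 for rotl, and becomes
; 0x80 for rotr below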
function %rotr(i64, i64, i8) -> i64, i64 {
block0(v0: i64, v1: i64, v2: i8):
v3 = iconcat v0, v1
v4 = rotr.i128 v3, v2
v5, v6 = isplit v4
return v5, v6
}
; run: %rotr(0x01010101_01010101, 0x01010101_01010101, 9) == [0x80808080_80808080, 0x80808080_80808080]
; run: %rotr(0x01010101_01010101, 0x01010101_01010101, 73) == [0x80808080_80808080, 0x80808080_80808080]
; run: %rotr(0x01010101_01010101, 0x02020202_02020202, 0) == [0x01010101_01010101, 0x02020202_02020202]
; run: %rotr(0x01010101_01010101, 0x03030303_03030303, 128) == [0x01010101_01010101, 0x03030303_03030303]
function %rotl_amt_i128(i64, i64, i8) -> i64, i64 {
block0(v0: i64, v1: i64, v2: i8):
v3 = uextend.i64 v2
v4 = iconcat v3, v3
v5 = iconcat v0, v1
v6 = rotl.i128 v5, v4
v7, v8 = isplit v6
return v7, v8
}
; run: %rotl_amt_i128(0x01010101_01010101, 0x01010101_01010101, 9) == [0x02020202_02020202, 0x02020202_02020202]
; run: %rotl_amt_i128(0x01010101_01010101, 0x01010101_01010101, 73) == [0x02020202_02020202, 0x02020202_02020202]
; run: %rotl_amt_i128(0x01010101_01010101, 0x02020202_02020202, 0) == [0x01010101_01010101, 0x02020202_02020202]
; run: %rotl_amt_i128(0x01010101_01010101, 0x03030303_03030303, 128) == [0x01010101_01010101, 0x03030303_03030303]
function %rotr_amt_i128(i64, i64, i8) -> i64, i64 {
block0(v0: i64, v1: i64, v2: i8):
v3 = uextend.i64 v2
v4 = iconcat v3, v3
v5 = iconcat v0, v1
v6 = rotr.i128 v5, v4
v7, v8 = isplit v6
return v7, v8
}
; run: %rotr_amt_i128(0x01010101_01010101, 0x01010101_01010101, 9) == [0x80808080_80808080, 0x80808080_80808080]
; run: %rotr_amt_i128(0x01010101_01010101, 0x01010101_01010101, 73) == [0x80808080_80808080, 0x80808080_80808080]
; run: %rotr_amt_i128(0x01010101_01010101, 0x02020202_02020202, 0) == [0x01010101_01010101, 0x02020202_02020202]
; run: %rotr_amt_i128(0x01010101_01010101, 0x03030303_03030303, 128) == [0x01010101_01010101, 0x03030303_03030303]

View File

@@ -0,0 +1,193 @@
test run
; target aarch64 TODO: Not yet implemented on aarch64
; target s390x TODO: Not yet implemented on s390x
set enable_simd
target x86_64 machinst skylake
set enable_simd
target x86_64 legacy skylake
function %iadd_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0:i32x4, v1:i32x4):
v2 = iadd v0, v1
return v2
}
; run: %iadd_i32x4([1 1 1 1], [1 2 3 4]) == [2 3 4 5]
function %iadd_i8x16_with_overflow() -> i8x16 {
block0:
v0 = vconst.i8x16 [255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255]
v1 = vconst.i8x16 [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
v2 = iadd v0, v1
return v2
}
; run: %iadd_i8x16_with_overflow() == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
function %isub_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = isub v0, v1
return v2
}
; run: %isub_i32x4([1 1 1 1], [1 2 3 4]) == [0 -1 -2 -3]
function %ineg_i32x4(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = ineg v0
return v1
}
; run: %ineg_i32x4([1 1 1 1]) == [-1 -1 -1 -1]
function %imul_i64x2(i64x2, i64x2) -> i64x2 {
block0(v0: i64x2, v1: i64x2):
v2 = imul v0, v1
return v2
}
; run: %imul_i64x2([0 2], [0 2]) == [0 4]
function %imul_i32x4(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = imul v0, v1
return v2
}
; run: %imul_i32x4([-1 0 1 0x80_00_00_01], [2 2 2 2]) == [-2 0 2 2]
; Note above how bits are truncated: 0x80_00_00_01 * 2 == 0x1_00_00_00_02, but
; the leading 1 is dropped.
function %imul_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = imul v0, v1
return v2
}
; run: %imul_i16x8([-1 0 1 0x7f_ff 0 0 0 0], [2 2 2 2 0 0 0 0]) == [-2 0 2 0xff_fe 0 0 0 0]
function %sadd_sat_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = sadd_sat v0, v1
return v2
}
; run: %sadd_sat_i8x16([0x7f 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [0x7f 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
function %uadd_sat_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = uadd_sat v0, v1
return v2
}
; run: %uadd_sat_i16x8([-1 0 0 0 0 0 0 0], [-1 1 1 1 1 1 1 1]) == [65535 1 1 1 1 1 1 1]
function %ssub_sat_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = ssub_sat v0, v1
return v2
}
; run: %ssub_sat_i8x16([0x80 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [0x80 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff]
; Note that 0x80 == -128 and subtracting 1 from that should saturate.
function %usub_sat_i8x16(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = usub_sat v0, v1
return v2
}
; run: %usub_sat_i8x16([0x80 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]) == [0x7f 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
function %add_sub_f32x4() -> b1 {
block0:
v0 = vconst.f32x4 [0x4.2 0.0 0.0 0.0]
v1 = vconst.f32x4 [0x1.0 0x1.0 0x1.0 0x1.0]
v2 = vconst.f32x4 [0x5.2 0x1.0 0x1.0 0x1.0]
v3 = fadd v0, v1
v4 = fcmp eq v3, v2
v6 = fsub v2, v1
v7 = fcmp eq v6, v0
v8 = band v4, v7
v9 = vall_true v8
return v9
}
; run
function %mul_div_f32x4() -> b1 {
block0:
v0 = vconst.f32x4 [0x4.2 -0x2.1 0x2.0 0.0]
v1 = vconst.f32x4 [0x3.4 0x6.7 0x8.9 0xa.b]
v2 = vconst.f32x4 [0xd.68 -0xd.47 0x11.2 0x0.0]
v3 = fmul v0, v1
v4 = fcmp eq v3, v2
v6 = fdiv v2, v1
v7 = fcmp eq v6, v0
v8 = band v4, v7
v9 = vall_true v8
return v9
}
; run
function %sqrt_f64x2(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = sqrt v0
return v1
}
; run: %sqrt_f64x2([0x9.0 0x1.0]) == [0x3.0 0x1.0]
function %fmax_f64x2(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = fmax v0, v1
return v2
}
; note below how NaNs are quieted but (unlike fmin) retain their sign: this discrepancy is allowed by non-determinism
; in the spec, see https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0.
; run: %fmax_f64x2([-0x0.0 -0x1.0], [+0x0.0 0x1.0]) == [+0x0.0 0x1.0]
; run: %fmax_f64x2([-NaN NaN], [0x0.0 0x100.0]) == [-NaN NaN]
; run: %fmax_f64x2([NaN 0.0], [0.0 0.0]) == [NaN 0.0]
; run: %fmax_f64x2([-NaN 0.0], [0x1.0 0.0]) == [-NaN 0.0]
; run: %fmax_f64x2([NaN:0x42 0.0], [0x1.0 0.0]) == [NaN 0.0]
function %fmin_f64x2(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = fmin v0, v1
return v2
}
; note below how NaNs are quieted and negative: this is due to non-determinism in the spec for NaNs, see
; https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0.
; run: %fmin_f64x2([-0x0.0 -0x1.0], [+0x0.0 0x1.0]) == [-0x0.0 -0x1.0]
; run: %fmin_f64x2([-NaN 0x100.0], [0.0 NaN]) == [-NaN -NaN]
; run: %fmin_f64x2([NaN 0.0], [0.0 0.0]) == [-NaN 0.0]
; run: %fmin_f64x2([-NaN 0.0], [0x1.0 0.0]) == [-NaN 0.0]
; run: %fmin_f64x2([NaN:0x42 0.0], [0x1.0 0.0]) == [-NaN 0.0]
function %fneg_f64x2(f64x2) -> f64x2 {
block0(v0: f64x2):
v1 = fneg v0
return v1
}
; run: %fneg_f64x2([0x1.0 -0x1.0]) == [-0x1.0 0x1.0]
function %fneg_f32x4(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = fneg v0
return v1
}
; run: %fneg_f32x4([0x0.0 -0x0.0 -Inf Inf]) == [-0x0.0 0x0.0 Inf -Inf]
function %fabs_f32x4(f32x4) -> f32x4 {
block0(v0: f32x4):
v1 = fabs v0
return v1
}
; run: %fabs_f32x4([0x0.0 -0x1.0 0x2.0 -0x3.0]) == [0x0.0 0x1.0 0x2.0 0x3.0]
function %average_rounding_i16x8(i16x8, i16x8) -> i16x8 {
block0(v0: i16x8, v1: i16x8):
v2 = avg_round v0, v1
return v2
}
; run: %average_rounding_i16x8([0 0 0 1 42 19 -1 0xffff], [0 1 2 4 42 18 -1 0]) == [0 1 1 3 42 19 -1 0x8000]
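; avg_round computes the unsigned rounding average (a + b + 1) >> 1 in each lane,
; e.g. lanes [0xffff, 0] average to 0x8000 instead of overflowing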
function %iabs(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = iabs v0
return v1
}
; run: %iabs([-42 -1 0 1]) == [42 1 0 1]

View File

@@ -0,0 +1,44 @@
test run
target aarch64
; target s390x TODO: Not yet implemented on s390x
set opt_level=speed_and_size
set enable_simd
target x86_64 machinst skylake
set opt_level=speed_and_size
set enable_simd
target x86_64 legacy haswell
;; x86_64 legacy: Test if bitselect->vselect optimization works properly
function %mask_from_icmp(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = icmp sge v0, v1
v3 = raw_bitcast.i32x4 v2
v4 = bitselect v3, v0, v1
return v4
}
; run: %mask_from_icmp([5 6 7 8], [1 10 20 7]) == [5 10 20 8]
function %mask_casted(i64x2, i64x2, i32x4) -> i64x2 {
block0(v0: i64x2, v1: i64x2, v2: i32x4):
v3 = raw_bitcast.i64x2 v2
v4 = bitselect v3, v0, v1
return v4
}
; run: %mask_casted([0 0], [0xFFFFFF 0xFFFF4F], [0xFFF1 0 0xF 0]) == [0xFF000E 0xFFFF40]
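; %good_const_mask uses a constant mask in which every byte is 0x00 or 0xff, the shape the
; bitselect->vselect optimization noted above can lower to a byte-wise blend; %bad_const_mask
; mixes set and clear bits within bytes, so it must remain a genuine bitwise select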
function %good_const_mask(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = vconst.i32x4 [0x0000FF00 0x00FF00FF 0x00FF00FF 0xFF00FFFF]
v4 = bitselect v2, v0, v1
return v4
}
; run: %good_const_mask([0x1234 0x5678 0x1234 0x5678], [0xAAAA 0xAAAA 0xAAAA 0xAAAA]) == [0x12AA 0xAA78 0xAA34 0x5678]
function %bad_const_mask(i32x4, i32x4) -> i32x4 {
block0(v0: i32x4, v1: i32x4):
v2 = vconst.i32x4 [0x0000FF00 0x00FF00FF 0x00FF000F 0xFF00FFF0]
v4 = bitselect v2, v0, v1
return v4
}
; run: %bad_const_mask([0x1234 0x5678 0x1234 0x5678], [0xAAAA 0xAAAA 0xAAAA 0xAAAA]) == [0x12AA 0xAA78 0xAAA4 0x567A]

View File

@@ -0,0 +1,214 @@
test run
set enable_simd
target x86_64 legacy skylake
; TODO: once available, replace all lane extraction with `icmp + all_ones`
function %ishl_i32x4() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i32x4 [1 2 4 8]
v2 = ishl v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 2
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 16
v7 = band v4, v6
return v7
}
; run
function %ishl_too_large_i16x8() -> b1 {
block0:
v0 = iconst.i32 17 ; note that this will shift off the end of each lane
v1 = vconst.i16x8 [1 2 4 8 16 32 64 128]
v2 = ishl v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 0
v7 = band v4, v6
return v7
}
; run
function %ushr_i8x16() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ushr v1, v0
v3 = vconst.i8x16 [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %sshr_i8x16() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1]
v2 = sshr v1, v0
v3 = vconst.i8x16 [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %ishl_i8x16() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v2 = ishl v1, v0
v3 = vconst.i8x16 [0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %ushr_i64x2() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i64x2 [1 2]
v2 = ushr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 1
v6 = icmp_imm eq v5, 1
v7 = band v4, v6
return v7
}
; run
function %ushr_too_large_i32x4() -> b1 {
block0:
v0 = iconst.i32 33 ; note that this will shift off the end of each lane
v1 = vconst.i32x4 [1 2 4 8]
v2 = ushr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 0
v7 = band v4, v6
return v7
}
; run
function %sshr_i16x8() -> b1 {
block0:
v0 = iconst.i32 1
v1 = vconst.i16x8 [-1 2 4 8 -16 32 64 128]
v2 = sshr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0xffff ; because of the shifted-in sign-bit, this remains 0xffff == -1
v5 = extractlane v2, 4
v6 = icmp_imm eq v5, 0xfff8 ; -16 has been shifted to -8 == 0xfff8
v7 = band v4, v6
return v7
}
; run
function %sshr_too_large_i32x4() -> b1 {
block0:
v0 = iconst.i32 33 ; note that this will shift off the end of each lane
v1 = vconst.i32x4 [1 2 4 -8]
v2 = sshr v1, v0
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 0
v5 = extractlane v2, 3
v6 = icmp_imm eq v5, 0xffff_ffff ; shifting in the sign-bit repeatedly fills the result with 1s
v7 = band v4, v6
return v7
}
; run
function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0:i64x2, v1:i32):
v2 = sshr v0, v1
return v2
}
; run: %sshr_i64x2([1 -1], 0) == [1 -1]
; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
; run: %sshr_i64x2([2 -2], 1) == [1 -1]
; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]
function %bitselect_i8x16() -> b1 {
block0:
v0 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255] ; the selector vector
v1 = vconst.i8x16 [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42] ; for each 1-bit in v0 the bit of v1 is selected
v2 = vconst.i8x16 [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127] ; for each 0-bit in v0 the bit of v2 is selected
v3 = bitselect v0, v1, v2
v4 = extractlane v3, 0
v5 = icmp_imm eq v4, 42
v6 = extractlane v3, 1
v7 = icmp_imm eq v6, 0
v8 = extractlane v3, 15
v9 = icmp_imm eq v8, 42
v10 = band v5, v7
v11 = band v10, v9
return v11
}
; run
function %sshr_imm_i32x4() -> b1 {
block0:
v1 = vconst.i32x4 [1 2 4 -8]
v2 = sshr_imm v1, 1
v3 = vconst.i32x4 [0 1 2 -4]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %ushr_imm_i16x8() -> b1 {
block0:
v1 = vconst.i16x8 [1 2 4 -8 0 0 0 0]
v2 = ushr_imm v1, 1
v3 = vconst.i16x8 [0 1 2 32764 0 0 0 0] ; -4 with MSB unset == 32764
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %ishl_imm_i64x2() -> b1 {
block0:
v1 = vconst.i64x2 [1 0]
v2 = ishl_imm v1, 1
v3 = vconst.i64x2 [2 0]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run

View File

@@ -0,0 +1,132 @@
test run
target aarch64
; target s390x TODO: Not yet implemented on s390x
set enable_simd
target x86_64 machinst skylake
function %bitselect_i8x16(i8x16, i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16, v2: i8x16):
v3 = bitselect v0, v1, v2
return v3
}
; Remember that bitselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %bitselect_i8x16([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 255], [127 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42], [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 127]) == [42 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
function %vselect_i32x4(i32x4, i32x4) -> i32x4 {
block0(v1: i32x4, v2: i32x4):
; `make_trampoline` still does not know how to convert boolean vector types
; so we load the value directly here.
v0 = vconst.b32x4 [true true false false]
v3 = vselect v0, v1, v2
return v3
}
; Remember that vselect accepts: 1) the selector vector, 2) the "if true" vector, and 3) the "if false" vector.
; run: %vselect_i32x4([1 2 -1 -1], [-1 -1 3 4]) == [1 2 3 4]
; shift left
function %ishl_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
v2 = ishl v0, v1
return v2
}
; run: %ishl_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 4) == [0x00 0x10 0x20 0x30 0x40 0x50 0x60 0x70 0x80 0x90 0xa0 0xb0 0xc0 0xd0 0xe0 0xf0]
function %ishl_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
v2 = ishl v0, v1
return v2
}
; run: %ishl_i16x8([1 2 4 8 16 32 64 128], 17) == [0 0 0 0 0 0 0 0]
function %ishl_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = ishl v0, v1
return v2
}
; run: %ishl_i32x4([1 2 4 8], 1) == [2 4 8 16]
function %ishl_imm_i64x2(i64x2) -> i64x2 {
block0(v0: i64x2):
v2 = ishl_imm v0, 1
return v2
}
; run: %ishl_imm_i64x2([1 0]) == [2 0]
; shift right (logical)
function %ushr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
v2 = ushr v0, v1
return v2
}
; run: %ushr_i8x16([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], 1) == [0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7]
function %ushr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = ushr v0, v1
return v2
}
; run: %ushr_i32x4([1 2 4 8], 33) == [0 0 0 0]
function %ushr_i64x2(i64x2, i32) -> i64x2 {
block0(v0: i64x2, v1: i32):
v2 = ushr v0, v1
return v2
}
; run: %ushr_i64x2([1 2], 1) == [0 1]
; shift right (arithmetic)
function %sshr_i8x16(i8x16, i32) -> i8x16 {
block0(v0: i8x16, v1: i32):
v2 = sshr v0, v1
return v2
}
; run: %sshr_i8x16([0 0xff 2 0xfd 4 0xfb 6 0xf9 8 0xf7 10 0xf5 12 0xf3 14 0xf1], 1) == [0 0xff 1 0xfe 2 0xfd 3 0xfc 4 0xfb 5 0xfa 6 0xf9 7 0xf8]
function %sshr_i16x8(i16x8, i32) -> i16x8 {
block0(v0: i16x8, v1: i32):
v2 = sshr v0, v1
return v2
}
; note: because of the shifted-in sign-bit, lane 0 remains -1 == 0xffff, whereas lane 4 has been shifted to -8 == 0xfff8
; run: %sshr_i16x8([-1 2 4 8 -16 32 64 128], 1) == [-1 1 2 4 -8 16 32 64]
function %sshr_i32x4(i32x4, i32) -> i32x4 {
block0(v0: i32x4, v1: i32):
v2 = sshr v0, v1
return v2
}
; note: shifting in the sign-bit repeatedly in lane 3 fills the result with 1s (-1 == 0xffff_ffff)
; run: %sshr_i32x4([1 2 4 -8], 33) == [0 0 0 0xffff_ffff]
function %sshr_i64x2(i64x2, i32) -> i64x2 {
block0(v0:i64x2, v1:i32):
v2 = sshr v0, v1
return v2
}
; run: %sshr_i64x2([1 -1], 0) == [1 -1]
; run: %sshr_i64x2([1 -1], 1) == [0 -1] ; note the -1 shift result
; run: %sshr_i64x2([2 -2], 1) == [1 -1]
; run: %sshr_i64x2([0x80000000_00000000 0x7FFFFFFF_FFFFFFFF], 63) == [0xFFFFFFFF_FFFFFFFF 0]
function %sshr_imm_i32x4(i32x4) -> i32x4 {
block0(v0: i32x4):
v1 = sshr_imm v0, 1
return v1
}
; run: %sshr_imm_i32x4([1 2 4 -8]) == [0 1 2 -4]
function %sshr_imm_i16x8(i16x8) -> i16x8 {
block0(v0: i16x8):
v1 = sshr_imm v0, 1
return v1
}
; run: %sshr_imm_i16x8([1 2 4 -8 0 0 0 0]) == [0 1 2 -4 0 0 0 0]

View File

@@ -0,0 +1,44 @@
test run
set enable_simd
target x86_64 legacy
function %maxs_i8x16() -> b1 {
block0:
v0 = vconst.i8x16 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] ; 1 will be greater than -1 == 0xff with
; signed max
v1 = vconst.i8x16 [0xff 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
v2 = x86_pmaxs v0, v1
v8 = vall_true v2
return v8
}
; run
function %maxu_i16x8() -> b1 {
block0:
v0 = vconst.i16x8 [0 1 1 1 1 1 1 1]
v1 = vconst.i16x8 [-1 1 1 1 1 1 1 1] ; -1 == 0xff will be greater with unsigned max
v2 = x86_pmaxu v0, v1
v8 = vall_true v2
return v8
}
; run
function %mins_i32x4() -> b1 {
block0:
v0 = vconst.i32x4 [0 1 1 1]
v1 = vconst.i32x4 [-1 1 1 1] ; -1 == 0xff will be less with signed min
v2 = x86_pmins v0, v1
v8 = vall_true v2
return v8
}
; run
function %minu_i8x16() -> b1 {
block0:
v0 = vconst.i8x16 [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1] ; 1 < 2 with unsigned min
v1 = vconst.i8x16 [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
v2 = x86_pminu v0, v1
v8 = vall_true v2
return v8
}
; run

View File

@@ -0,0 +1,210 @@
test run
; target aarch64 TODO: Not yet implemented on aarch64
; target s390x TODO: Not yet implemented on s390x
set enable_simd
target x86_64 machinst
set enable_simd
target x86_64 legacy
function %icmp_eq_i8x16() -> b8 {
block0:
v0 = vconst.i8x16 0x00
v1 = vconst.i8x16 0x00
v2 = icmp eq v0, v1
v3 = extractlane v2, 0
return v3
}
; run
function %icmp_eq_i64x2() -> b64 {
block0:
v0 = vconst.i64x2 0xffffffffffffffffffffffffffffffff
v1 = vconst.i64x2 0xffffffffffffffffffffffffffffffff
v2 = icmp eq v0, v1
v3 = extractlane v2, 1
return v3
}
; run
function %icmp_ne_i32x4() -> b1 {
block0:
v0 = vconst.i32x4 [0 1 2 3]
v1 = vconst.i32x4 [7 7 7 7]
v2 = icmp ne v0, v1
v3 = vall_true v2
return v3
}
; run
function %icmp_ne_i16x8() -> b1 {
block0:
v0 = vconst.i16x8 [0 1 2 3 4 5 6 7]
v1 = vconst.i16x8 [0 1 2 3 4 5 6 7]
v2 = icmp ne v0, v1
v3 = vall_true v2
v4 = bint.i32 v3
v5 = icmp_imm eq v4, 0
return v5
}
; run
function %icmp_sgt_i8x16() -> b1 {
block0:
v0 = vconst.i8x16 [0 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0]
v1 = vconst.i8x16 [1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0xff]
v2 = icmp sgt v0, v1
v3 = raw_bitcast.i8x16 v2
v4 = vconst.i8x16 [0 0 0xff 0 0 0 0 0 0 0 0 0 0 0 0 0xff]
v7 = icmp eq v3, v4
v8 = vall_true v7
return v8
}
; run
function %icmp_sgt_i64x2() -> b1 {
block0:
v0 = vconst.i64x2 [0 -42]
v1 = vconst.i64x2 [-1 -43]
v2 = icmp sgt v0, v1
v8 = vall_true v2
return v8
}
; run
function %icmp_ugt_i8x16() -> b1 {
block0:
v0 = vconst.i8x16 [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
v1 = vconst.i8x16 [0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
v2 = icmp ugt v0, v1
v8 = vall_true v2
return v8
}
; run
function %icmp_sge_i16x8() -> b1 {
block0:
v0 = vconst.i16x8 [-1 1 2 3 4 5 6 7]
v1 = vconst.i16x8 [-1 1 1 1 1 1 1 1]
v2 = icmp sge v0, v1
v8 = vall_true v2
return v8
}
; run
function %icmp_uge_i32x4() -> b1 {
block0:
v0 = vconst.i32x4 [1 2 3 4]
v1 = vconst.i32x4 [1 1 1 1]
v2 = icmp uge v0, v1
v8 = vall_true v2
return v8
}
; run
function %icmp_slt_i32x4() -> b1 {
block0:
v0 = vconst.i32x4 [-1 1 1 1]
v1 = vconst.i32x4 [1 2 3 4]
v2 = icmp slt v0, v1
v8 = vall_true v2
return v8
}
; run
function %icmp_ult_i32x4() -> b1 {
block0:
v0 = vconst.i32x4 [1 1 1 1]
v1 = vconst.i32x4 [-1 2 3 4] ; -1 = 0xffff... will be greater than 1 when unsigned
v2 = icmp ult v0, v1
v8 = vall_true v2
return v8
}
; run
function %icmp_ult_i16x8() -> b1 {
block0:
v0 = vconst.i16x8 [-1 -1 -1 -1 -1 -1 -1 -1]
v1 = vconst.i16x8 [-1 -1 -1 -1 -1 -1 -1 -1]
v2 = icmp ult v0, v1
v3 = vconst.i16x8 0x00
v4 = raw_bitcast.i16x8 v2
v5 = icmp eq v3, v4
v8 = vall_true v5
return v8
}
; run
function %icmp_sle_i16x8() -> b1 {
block0:
v0 = vconst.i16x8 [-1 -1 0 0 0 0 0 0]
v1 = vconst.i16x8 [-1 0 0 0 0 0 0 0]
v2 = icmp sle v0, v1
v8 = vall_true v2
return v8
}
; run
function %icmp_ule_i16x8() -> b1 {
block0:
v0 = vconst.i16x8 [-1 0 0 0 0 0 0 0]
v1 = vconst.i16x8 [-1 -1 0 0 0 0 0 0]
v2 = icmp ule v0, v1
v8 = vall_true v2
return v8
}
; run
function %fcmp_eq_f32x4() -> b1 {
block0:
v0 = vconst.f32x4 [0.0 -0x4.2 0x0.33333 -0.0]
v1 = vconst.f32x4 [0.0 -0x4.2 0x0.33333 -0.0]
v2 = fcmp eq v0, v1
v8 = vall_true v2
return v8
}
; run
function %fcmp_lt_f32x4() -> b1 {
block0:
v0 = vconst.f32x4 [0.0 -0x4.2 0x0.0 -0.0]
v1 = vconst.f32x4 [0x0.001 0x4.2 0x0.33333 0x1.0]
v2 = fcmp lt v0, v1
v8 = vall_true v2
return v8
}
; run
function %fcmp_ge_f64x2() -> b1 {
block0:
v0 = vconst.f64x2 [0x0.0 0x4.2]
v1 = vconst.f64x2 [0.0 0x4.1]
v2 = fcmp ge v0, v1
v8 = vall_true v2
return v8
}
; run
function %fcmp_uno_f64x2() -> b1 {
block0:
v0 = vconst.f64x2 [0.0 NaN]
v1 = vconst.f64x2 [NaN 0x4.1]
v2 = fcmp uno v0, v1
v8 = vall_true v2
return v8
}
; run
function %fcmp_gt_nans_f32x4() -> b1 {
block0:
v0 = vconst.f32x4 [NaN 0x42.0 -NaN NaN]
v1 = vconst.f32x4 [NaN NaN 0x42.0 Inf]
v2 = fcmp gt v0, v1
; now check that the result v2 is all zeroes
v3 = vconst.i32x4 0x00
v4 = raw_bitcast.i32x4 v2
v5 = icmp eq v3, v4
v8 = vall_true v5
return v8
}
; run

View File

@@ -0,0 +1,41 @@
test run
target aarch64
; target s390x TODO: Not yet implemented on s390x
set enable_simd
target x86_64 machinst
set enable_simd
target x86_64 legacy
function %fcvt_from_sint(i32x4) -> f32x4 {
block0(v0: i32x4):
v1 = fcvt_from_sint.f32x4 v0
return v1
}
; run: %fcvt_from_sint([-1 0 1 123456789]) == [-0x1.0 0.0 0x1.0 0x75bcd18.0]
; Note that 123456789 rounds to 123456792.0, an error of 3
function %fcvt_from_uint(i32x4) -> f32x4 {
block0(v0: i32x4):
v1 = fcvt_from_uint.f32x4 v0
return v1
}
; run: %fcvt_from_uint([0 0 0 0]) == [0x0.0 0x0.0 0x0.0 0x0.0]
; run: %fcvt_from_uint([0xFFFFFFFF 0 1 123456789]) == [0x100000000.0 0.0 0x1.0 0x75bcd18.0]
; Note that 0xFFFFFFFF is decimal 4,294,967,295 and is rounded up 1 to 4,294,967,296 in f32x4.
function %fcvt_to_sint_sat(f32x4) -> i32x4 {
block0(v0:f32x4):
v1 = fcvt_to_sint_sat.i32x4 v0
return v1
}
; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF]
; run: %fcvt_to_sint_sat([-0x8.1 0x0.0 0x0.0 -0x1.0p100]) == [-8 0 0 0x80000000]
function %fcvt_to_uint_sat(f32x4) -> i32x4 {
block0(v0:f32x4):
v1 = fcvt_to_uint_sat.i32x4 v0
return v1
}
; run: %fcvt_to_uint_sat([0x1.0 0x4.2 0x4.6 0x1.0p100]) == [1 4 4 0xFFFFFFFF]
; run: %fcvt_to_uint_sat([-0x8.1 -0x0.0 0x0.0 -0x1.0p100]) == [0 0 0 0]
; run: %fcvt_to_uint_sat([0xB2D05E00.0 0.0 0.0 0.0]) == [3000000000 0 0 0]

View File

@@ -0,0 +1,221 @@
test run
set enable_simd
target x86_64 legacy
function %shuffle_different_ssa_values() -> b1 {
block0:
v0 = vconst.i8x16 0x00
v1 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31] ; use the first lane of v0 throughout except use the last lane of v1
v3 = extractlane.i8x16 v2, 15
v4 = iconst.i8 42
v5 = icmp eq v3, v4
return v5
}
; run
function %shuffle_same_ssa_value() -> b1 {
block0:
v0 = vconst.i8x16 0x01000000_00000000_00000000_00000000 ; note where lane 15 is when written with hexadecimal syntax
v1 = shuffle v0, v0, 0x0f0f0f0f_0f0f0f0f_0f0f0f0f_0f0f0f0f ; use the last lane of v0 to fill all lanes
v2 = extractlane.i8x16 v1, 4
v3 = iconst.i8 0x01
v4 = icmp eq v2, v3
return v4
}
; run
function %compare_shuffle() -> b1 {
block0:
v1 = vconst.i32x4 [0 1 2 3]
v2 = raw_bitcast.i8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
; keep each lane in place from the first vector
v3 = shuffle v2, v2, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v4 = raw_bitcast.i32x4 v3
v5 = extractlane.i32x4 v4, 3
v6 = icmp_imm eq v5, 3
v7 = extractlane.i32x4 v4, 0
v8 = icmp_imm eq v7, 0
v9 = band v6, v8
return v9
}
; run
function %compare_shuffle() -> b32 {
block0:
v1 = vconst.b32x4 [true false true false]
v2 = raw_bitcast.b8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
; pair up the true values to make the entire vector true
v3 = shuffle v2, v2, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
v4 = raw_bitcast.b32x4 v3
v5 = extractlane v4, 3
v6 = extractlane v4, 0
v7 = band v5, v6
return v7
}
; run
; TODO once SIMD vector comparison is implemented, remove use of extractlane below
function %insertlane_b8() -> b8 {
block0:
v1 = bconst.b8 true
v2 = vconst.b8x16 [false false false false false false false false false false false false false
false false false]
v3 = insertlane v2, v1, 10
v4 = extractlane v3, 10
return v4
}
; run
function %insertlane_f32() -> b1 {
block0:
v0 = f32const 0x42.42
v1 = vconst.f32x4 0x00
v2 = insertlane v1, v0, 1
v3 = extractlane v2, 1
v4 = fcmp eq v3, v0
return v4
}
; run
function %insertlane_f64_lane1() -> b1 {
block0:
v0 = f64const 0x42.42
v1 = vconst.f64x2 0x00
v2 = insertlane v1, v0, 1
v3 = extractlane v2, 1
v4 = fcmp eq v3, v0
return v4
}
; run
function %insertlane_f64_lane0() -> b1 {
block0:
v0 = f64const 0x42.42
v1 = vconst.f64x2 0x00
v2 = insertlane v1, v0, 0
v3 = extractlane v2, 0
v4 = fcmp eq v3, v0
return v4
}
; run
function %extractlane_b8() -> b8 {
block0:
v1 = vconst.b8x16 [false false false false false false false false false false true false false
false false false]
v2 = extractlane v1, 10
return v2
}
; run
function %extractlane_i16() -> b1 {
block0:
v0 = vconst.i16x8 0x00080007000600050004000300020001
v1 = extractlane v0, 1
v2 = icmp_imm eq v1, 2
return v2
}
; run
function %extractlane_f32() -> b1 {
block0:
v0 = f32const 0x42.42
v1 = vconst.f32x4 [0x00.00 0x00.00 0x00.00 0x42.42]
v2 = extractlane v1, 3
v3 = fcmp eq v2, v0
return v3
}
; run
function %extractlane_i32_with_vector_reuse() -> b1 {
block0:
v0 = iconst.i32 42
v1 = iconst.i32 99
v2 = splat.i32x4 v0
v3 = insertlane v2, v1, 2
v4 = extractlane v3, 3
v5 = icmp eq v4, v0
v6 = extractlane v3, 2
v7 = icmp eq v6, v1
v8 = band v5, v7
return v8
}
; run
function %extractlane_f32_with_vector_reuse() -> b1 {
block0:
v0 = f32const 0x42.42
v1 = f32const 0x99.99
v2 = splat.f32x4 v0
v3 = insertlane v2, v1, 2
v4 = extractlane v3, 3
v5 = fcmp eq v4, v0
v6 = extractlane v3, 2
v7 = fcmp eq v6, v1
v8 = band v5, v7
return v8
}
; run
function %swizzle() -> b1 {
block0:
v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v1 = vconst.i8x16 [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 42]
v2 = swizzle.i8x16 v0, v1 ; reverse the lanes, with over-large index 42 using lane 0
v3 = vconst.i8x16 [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %swizzle_with_overflow() -> b1 {
block0:
v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v1 = vconst.i8x16 [16 250 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
v2 = swizzle.i8x16 v0, v1 ; 250 should overflow but saturate so that the MSB is set (PSHUFB uses this to shuffle from lane 0)
v3 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %unpack_low() -> b1 {
block0:
v0 = vconst.i32x4 [0 1 2 3]
v1 = vconst.i32x4 [4 5 6 7]
v2 = x86_punpckl v0, v1
v3 = vconst.i32x4 [0 4 1 5]
v4 = icmp eq v2, v3
v5 = vall_true v4
return v5
}
; run
function %snarrow(i32x4, i32x4) -> i16x8 {
block0(v0: i32x4, v1: i32x4):
v2 = snarrow v0, v1
return v2
}
; run: %snarrow([0 1 -1 0x0001ffff], [4 5 -6 0xffffffff]) == [0 1 -1 0x7fff 4 5 -6 0xffff]
function %unarrow(i32x4, i32x4) -> i16x8 {
block0(v0: i32x4, v1: i32x4):
v2 = unarrow v0, v1
return v2
}
; run: %unarrow([0 1 -1 0x0001ffff], [4 5 -6 0xffffffff]) == [0 1 0 0xffff 4 5 0 0]
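; snarrow saturates each signed i32 into the signed i16 range (0x0001ffff -> 0x7fff), while
; unarrow clamps into the unsigned i16 range, so negative lanes such as -6 and 0xffffffff become 0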

View File

@@ -0,0 +1,211 @@
test run
; target aarch64 TODO: Not yet implemented on aarch64
; target s390x TODO: Not yet implemented on s390x
set enable_simd
target x86_64 machinst
;; shuffle
function %shuffle_different_ssa_values() -> i8x16 {
block0:
v0 = vconst.i8x16 0x00
v1 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
v2 = shuffle v0, v1, [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 31] ; use the first lane of v0 throughout except use the last lane of v1
return v2
}
; run: %shuffle_different_ssa_values() == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 42]
function %shuffle_same_ssa_value() -> i8x16 {
block0:
v0 = vconst.i8x16 0x01000000_00000000_00000000_00000000 ; note where lane 15 is when written with hexadecimal syntax
v1 = shuffle v0, v0, 0x0f0f0f0f_0f0f0f0f_0f0f0f0f_0f0f0f0f ; use the last lane of v0 to fill all lanes
return v1
}
; run: %shuffle_same_ssa_value() == [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
function %shuffle_i32x4_in_same_place() -> i32x4 {
block0:
v1 = vconst.i32x4 [0 1 2 3]
v2 = raw_bitcast.i8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
; keep each lane in place from the first vector
v3 = shuffle v2, v2, [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
v4 = raw_bitcast.i32x4 v3
return v4
}
; run: %shuffle_i32x4_in_same_place() == [0 1 2 3]
function %shuffle_b32x4_to_all_true() -> i32x4 {
block0:
v1 = vconst.b32x4 [true false true false]
v2 = raw_bitcast.b8x16 v1 ; we have to cast because shuffle is type-limited to Tx16
; pair up the true values to make the entire vector true
v3 = shuffle v2, v2, [0 1 2 3 0 1 2 3 8 9 10 11 8 9 10 11]
v4 = raw_bitcast.i32x4 v3 ; TODO store.b32x4 is unavailable; see https://github.com/bytecodealliance/wasmtime/issues/2237
return v4
}
; run: %shuffle_b32x4_to_all_true() == [0xffffffff 0xffffffff 0xffffffff 0xffffffff]
;; swizzle
function %swizzle(i8x16, i8x16) -> i8x16 {
block0(v0: i8x16, v1: i8x16):
v2 = swizzle.i8x16 v0, v1
return v2
}
; reverse the lanes, with over-large index 42 using lane 0
; run: %swizzle([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 42]) == [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
; 250 should overflow but saturate so that the MSB is set (PSHUFB uses this to shuffle from lane 0)
; run: %swizzle([0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15], [16 250 0 0 0 0 0 0 0 0 0 0 0 0 0 0]) == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
;; insertlane
function %insertlane_i8x16_first(i8x16, i8) -> i8x16 {
block0(v1: i8x16, v2: i8):
v3 = insertlane v1, v2, 0
return v3
}
; run: %insertlane_i8x16_first([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], 0xff) == [0xff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
function %insertlane_f32x4_second(f32x4, f32) -> f32x4 {
block0(v1: f32x4, v2: f32):
v3 = insertlane v1, v2, 1
return v3
}
; run: %insertlane_f32x4_second([0.0 0.0 0.0 0.0], 0x42.42) == [0.0 0x42.42 0.0 0.0]
function %insertlane_f64x2_first(f64x2, f64) -> f64x2 {
block0(v1: f64x2, v2: f64):
v3 = insertlane v1, v2, 0
return v3
}
; run: %insertlane_f64x2_first([0.0 0.0], 0x42.42) == [0x42.42 0.0]
function %insertlane_f64x2_second(f64x2, f64) -> f64x2 {
block0(v1: f64x2, v2: f64):
v3 = insertlane v1, v2, 1
return v3
}
; run: %insertlane_f64x2_second([0.0 0.0], 0x42.42) == [0.0 0x42.42]
;; extractlane
function %extractlane_b8x16() -> i8 {
block0:
v1 = vconst.b8x16 [false false false false false false false false false false true false false
false false false]
v2 = extractlane v1, 10
v3 = raw_bitcast.i8 v2
return v3
}
; run: %extractlane_b8x16() == 0xff
function %extractlane_i16x8_second(i16x8) -> i16 {
block0(v0: i16x8):
v1 = extractlane v0, 1
return v1
}
; run: %extractlane_i16x8_second(0x00080007000600050004000300020001) == 2
function %extractlane_f32x4_last(f32x4) -> f32 {
block0(v0: f32x4):
v1 = extractlane v0, 3
return v1
}
; run: %extractlane_f32x4_last([0x00.00 0x00.00 0x00.00 0x42.42]) == 0x42.42
function %extractlane_i32_with_vector_reuse() -> b1 {
block0:
v0 = iconst.i32 42
v1 = iconst.i32 99
v2 = vconst.i32x4 [42 42 42 42]
v3 = insertlane v2, v1, 2
v4 = extractlane v3, 3
v5 = icmp eq v4, v0
v6 = extractlane v3, 2
v7 = icmp eq v6, v1
v8 = band v5, v7
return v8
}
; run
function %extractlane_f32_with_vector_reuse() -> b1 {
block0:
v0 = f32const 0x42.42
v1 = f32const 0x99.99
v2 = vconst.f32x4 [0x42.42 0x42.42 0x42.42 0x42.42]
v3 = insertlane v2, v1, 2
v4 = extractlane v3, 3
v5 = fcmp eq v4, v0
v6 = extractlane v3, 2
v7 = fcmp eq v6, v1
v8 = band v5, v7
return v8
}
; run
;; splat
function %splat_i64x2() -> b1 {
block0:
v0 = iconst.i64 -1
v1 = splat.i64x2 v0
v2 = vconst.i64x2 [-1 -1]
v3 = icmp eq v1, v2
v8 = vall_true v3
return v8
}
; run
function %splat_i8(i8) -> i8x16 {
block0(v0: i8):
v1 = splat.i8x16 v0
return v1
}
; run: %splat_i8(0xff) == [0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff 0xff]
function %splat_i32(i32) -> i32x4 {
block0(v0: i32):
v1 = splat.i32x4 v0
return v1
}
; run: %splat_i32(42) == [42 42 42 42]
function %splat_f64(f64) -> f64x2 {
block0(v0: f64):
v1 = splat.f64x2 v0
return v1
}
; run: %splat_f64(-0x1.1) == [-0x1.1 -0x1.1]
; narrow
function %snarrow(i32x4, i32x4) -> i16x8 {
block0(v0: i32x4, v1: i32x4):
v2 = snarrow v0, v1
return v2
}
; run: %snarrow([0 1 -1 0x0001ffff], [4 5 -6 0xffffffff]) == [0 1 -1 0x7fff 4 5 -6 0xffff]
function %unarrow(i32x4, i32x4) -> i16x8 {
block0(v0: i32x4, v1: i32x4):
v2 = unarrow v0, v1
return v2
}
; run: %unarrow([0 1 -1 0x0001ffff], [4 5 -6 0xffffffff]) == [0 1 0 0xffff 4 5 0 0]

View File

@@ -0,0 +1,63 @@
test run
target aarch64
; target s390x TODO: Not yet implemented on s390x
set enable_simd
target x86_64 machinst
set enable_simd
target x86_64 legacy skylake
function %bnot() -> b32 {
block0:
v0 = vconst.b32x4 [true true true false]
v1 = bnot v0
v2 = extractlane v1, 3
return v2
}
; run
function %band_not() -> b1 {
block0:
v0 = vconst.i16x8 [1 0 0 0 0 0 0 0]
v1 = vconst.i16x8 [0 0 0 0 0 0 0 0]
v2 = band_not v0, v1
v3 = extractlane v2, 0
v4 = icmp_imm eq v3, 1
return v4
}
; run
function %vany_true_i16x8() -> b1 {
block0:
v0 = vconst.i16x8 [1 0 0 0 0 0 0 0]
v1 = vany_true v0
return v1
}
; run
function %vany_true_b32x4() -> b1 {
block0:
v0 = vconst.b32x4 [false false false false]
v1 = vany_true v0
v2 = bint.i32 v1
v3 = icmp_imm eq v2, 0
return v3
}
; run
function %vall_true_i16x8() -> b1 {
block0:
v0 = vconst.i16x8 [1 0 0 0 0 0 0 0]
v1 = vall_true v0
v2 = bint.i32 v1
v3 = icmp_imm eq v2, 0
return v3
}
; run
function %vall_true_b32x4() -> b1 {
block0:
v0 = vconst.b32x4 [true true true true]
v1 = vall_true v0
return v1
}
; run

View File

@@ -0,0 +1,46 @@
test run
set enable_simd
target x86_64 legacy
function %vconst_syntax() -> b1 {
block0:
v0 = vconst.i32x4 0x00000004_00000003_00000002_00000001 ; build constant using hexadecimal syntax
v1 = vconst.i32x4 [1 2 3 4] ; build constant using literal list syntax
; verify lane 1 matches
v2 = extractlane v0, 1
v3 = extractlane v1, 1
v4 = icmp eq v3, v2
; verify lane 1 has the correct value
v5 = icmp_imm eq v3, 2
v6 = band v4, v5
return v6
}
; run
; Since both jump tables and constants are emitted after the function body, it is important that any RIP-relative
; addressing of constants is not incorrect in the presence of jump tables. This test confirms that, even when both
; jump tables and constants are emitted, the constant addressing works correctly.
function %vconst_with_jumptables() -> b1 {
jt0 = jump_table [block0]
block10:
v10 = iconst.i64 0
br_table v10, block1, jt0
block0:
v0 = iconst.i64 100
jump block11(v0)
block1:
v1 = iconst.i64 101
jump block11(v1)
block11(v11: i64):
v12 = icmp_imm eq v11, 100 ; We should have jumped through block 0.
v13 = vconst.i32x4 [1 2 3 4]
v14 = extractlane.i32x4 v13, 1 ; Extract the second element...
v15 = icmp_imm eq v14, 2 ; ...which should be the value 2.
v16 = band v12, v15
return v16
}
; run

View File

@@ -0,0 +1,40 @@
test run
; target s390x TODO: Not yet implemented on s390x
; target aarch64 TODO: Not yet implemented on aarch64
set enable_simd
target x86_64 machinst
set enable_simd
target x86_64 legacy
set enable_simd
target x86_64 legacy skylake
function %vconst_zeroes() -> b1 {
block0:
v0 = vconst.i8x16 0x00
v1 = extractlane v0, 4
v2 = icmp_imm eq v1, 0
return v2
}
; run
function %vconst_ones() -> b1 {
block0:
v0 = vconst.i8x16 0xffffffffffffffffffffffffffffffff
v1 = extractlane v0, 2
v2 = icmp_imm eq v1, 0xff
return v2
}
; run
function %splat_i64x2() -> b1 {
block0:
v0 = iconst.i64 -1
v1 = splat.i64x2 v0
v2 = vconst.i64x2 [-1 -1]
v3 = icmp eq v1, v2
v8 = vall_true v3
return v8
}
; run

View File

@@ -0,0 +1,47 @@
test run
; target s390x TODO: Not yet implemented on s390x
target aarch64
set enable_simd
target x86_64 machinst
set enable_simd
target x86_64 legacy haswell
function %vselect_i8x16() -> i8x16 {
block0:
v1 = vconst.b8x16 [false true false true false true true true true true false false false false false false]
v2 = vconst.i8x16 [100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115]
v3 = vconst.i8x16 [200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i8x16() == [200 101 202 103 204 105 106 107 108 109 210 211 212 213 214 215]
function %vselect_i16x8() -> i16x8 {
block0:
v1 = vconst.b16x8 [false true false true false true true true]
v2 = vconst.i16x8 [100 101 102 103 104 105 106 107]
v3 = vconst.i16x8 [200 201 202 203 204 205 206 207]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i16x8() == [200 101 202 103 204 105 106 107]
function %vselect_i32x4() -> i32x4 {
block0:
v1 = vconst.b32x4 [false true false true]
v2 = vconst.i32x4 [100 101 102 103]
v3 = vconst.i32x4 [200 201 202 203]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i32x4() == [200 101 202 103]
function %vselect_i64x2() -> i64x2 {
block0:
v1 = vconst.b64x2 [false true]
v2 = vconst.i64x2 [100 101]
v3 = vconst.i64x2 [200 201]
v4 = vselect v1, v2, v3
return v4
}
; run: %vselect_i64x2() == [200 101]

View File

@@ -0,0 +1,40 @@
test run
target s390x
target aarch64
target x86_64 machinst
target x86_64 legacy
function %f(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> i64 {
block0(v0: i32, v1: i32, v2: i32, v3: i32, v4: i32, v5: i32, v6: i32, v7: i32, v8: i32, v9: i32, v10: i32, v11: i32, v12: i32, v13: i32, v14: i32, v15: i32, v16: i32, v17: i32, v18: i32, v19: i32):
v20 = iadd.i32 v0, v1
v21 = iadd.i32 v2, v3
v22 = iadd.i32 v4, v5
v23 = iadd.i32 v6, v7
v24 = iadd.i32 v8, v9
v25 = iadd.i32 v10, v11
v26 = iadd.i32 v12, v13
v27 = iadd.i32 v14, v15
v28 = iadd.i32 v16, v17
v29 = iadd.i32 v18, v19
v30 = iadd.i32 v20, v21
v31 = iadd.i32 v22, v23
v32 = iadd.i32 v24, v25
v33 = iadd.i32 v26, v27
v34 = iadd.i32 v28, v29
v35 = iadd.i32 v30, v31
v36 = iadd.i32 v32, v33
v37 = iadd.i32 v35, v34
v38 = iadd.i32 v36, v37
;; v38 should be zero (due to wrapping).
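;; (each i32 pair 0x80000000 + 0x80000000 wraps to 0, so the whole reduction tree sums to 0
;; and the final result is 1 + 0x80000000 + 0 == 0x80000001.)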
v39 = iconst.i64 1
v40 = uextend.i64 v0 ;; should be reloaded from a spillslot
v41 = uextend.i64 v38
v42 = iadd.i64 v39, v40
v43 = iadd.i64 v42, v41
return v43
}
; run: %f(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000) == 0x80000001