From fa7481a68191bfe91154ef9a2be3e7c84a111f32 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Fri, 6 Mar 2020 11:29:21 -0800
Subject: [PATCH] Add x86 implementation of SIMD swizzle instruction

---
 .../codegen/meta/src/isa/x86/legalize.rs      | 18 +++++++++++
 .../codegen/meta/src/shared/instructions.rs   | 31 ++++++++++++++++++-
 .../isa/x86/simd-lane-access-legalize.clif    | 11 +++++++
 .../isa/x86/simd-lane-access-run.clif         | 26 ++++++++++++++++
 4 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index 4a10c64ed2..e3027a6814 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -338,7 +338,9 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
     let splat = insts.by_name("splat");
     let shuffle = insts.by_name("shuffle");
     let sshr = insts.by_name("sshr");
+    let swizzle = insts.by_name("swizzle");
     let trueif = insts.by_name("trueif");
+    let uadd_sat = insts.by_name("uadd_sat");
     let umax = insts.by_name("umax");
     let umin = insts.by_name("umin");
     let ushr_imm = insts.by_name("ushr_imm");
@@ -375,6 +377,7 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
     let uimm8_one = Literal::constant(&imm.uimm8, 0x01);
     let u128_zeroes = constant(vec![0x00; 16]);
     let u128_ones = constant(vec![0xff; 16]);
+    let u128_seventies = constant(vec![0x70; 16]);
     let a = var("a");
     let b = var("b");
     let c = var("c");
@@ -459,6 +462,21 @@ fn define_simd(shared: &mut SharedDefinitions, x86_instructions: &InstructionGro
         );
     }
 
+    // SIMD swizzle; the following inefficient implementation is due to the Wasm SIMD spec requiring
+    // mask indexes greater than 15 to produce a 0 in the resulting lane. For the spec discussion,
+    // see https://github.com/WebAssembly/simd/issues/93.
+    {
+        let swizzle = swizzle.bind(vector(I8, sse_vector_size));
+        narrow.legalize(
+            def!(a = swizzle(x, y)),
+            vec![
+                def!(b = vconst(u128_seventies)),
+                def!(c = uadd_sat(y, b)),
+                def!(a = x86_pshufb(x, c)),
+            ],
+        );
+    }
+
     // SIMD bnot
     for ty in ValueType::all_lane_types().filter(allowed_simd_type) {
         let bnot = bnot.bind(vector(ty, sse_vector_size));
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index 6fd7db155f..9bc344eec6 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -517,7 +517,36 @@ fn define_simd_lane_access(
         .operands_out(vec![a]),
     );
 
-    let x = &Operand::new("x", TxN).with_doc("SIMD vector to modify");
+    let I8x16 = &TypeVar::new(
+        "I8x16",
+        "A SIMD vector type consisting of 16 lanes of 8-bit integers",
+        TypeSetBuilder::new()
+            .ints(8..8)
+            .simd_lanes(16..16)
+            .includes_scalars(false)
+            .build(),
+    );
+    let x = &Operand::new("x", I8x16).with_doc("Vector to modify by re-arranging lanes");
+    let y = &Operand::new("y", I8x16).with_doc("Mask for re-arranging lanes");
+
+    ig.push(
+        Inst::new(
+            "swizzle",
+            r#"
+        Vector swizzle.
+
+        Returns a new vector with byte-width lanes selected from the lanes of the first input
+        vector ``x`` as specified by the second input vector ``y``. The indices ``i`` in range
+        ``[0, 15]`` select the ``i``-th element of ``x``. For indices outside of the range the
+        resulting lane is 0. Note that this operates on byte-width lanes.
+        "#,
+            &formats.binary,
+        )
+        .operands_in(vec![x, y])
+        .operands_out(vec![a]),
+    );
+
+    let x = &Operand::new("x", TxN).with_doc("The vector to modify");
     let y = &Operand::new("y", &TxN.lane_of()).with_doc("New lane value");
     let Idx = &Operand::new("Idx", &imm.uimm8).with_doc("Lane index");
 
diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-legalize.clif
index 547acbd5c4..3c222f1f7f 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-legalize.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-legalize.clif
@@ -83,3 +83,14 @@ block0:
 ; nextln:     v4 = vconst.i8x16 0x00
 ; nextln:     v1 = x86_pshufb v3, v4
 ; nextln:     return v1
+
+function %swizzle() -> i8x16 {
+block0:
+    v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v1 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v2 = swizzle.i8x16 v0, v1
+    ; check: v3 = vconst.i8x16 0x70707070707070707070707070707070
+    ; nextln: v4 = uadd_sat v1, v3
+    ; nextln: v2 = x86_pshufb v0, v4
+    return v2
+}
diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif
index 80445cc67f..a99660f8fd 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-run.clif
@@ -165,3 +165,29 @@ block0:
     return v8
 }
 ; run
+
+function %swizzle() -> b1 {
+block0:
+    v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v1 = vconst.i8x16 [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 42]
+    v2 = swizzle.i8x16 v0, v1 ; reverse the lanes, with over-large index 42 producing a 0 lane
+
+    v3 = vconst.i8x16 [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+    v4 = icmp eq v2, v3
+    v5 = vall_true v4
+    return v5
+}
+; run:
+
+function %swizzle_with_overflow() -> b1 {
+block0:
+    v0 = vconst.i8x16 [0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    v1 = vconst.i8x16 [16 250 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+    v2 = swizzle.i8x16 v0, v1 ; 250 should overflow but saturate so that the MSB is set (PSHUFB zeroes any lane whose index has the MSB set)
+
+    v3 = vconst.i8x16 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+    v4 = icmp eq v2, v3
+    v5 = vall_true v4
+    return v5
+}
+; run: