From c8ddf8a34ced624b2c1fbb63bc786059a6387b29 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Tue, 7 Jul 2020 16:13:50 -0700
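Subject: [PATCH] Encode `[u|s]widen_low` for x86

Define four new SIMD instructions, `swiden_low`, `swiden_high`,
`uwiden_low` and `uwiden_high`, which sign- or zero-extend the low or
high lanes of a vector with 8- or 16-bit integer lanes, doubling the
lane width and halving the number of lanes. The type variable used by
`snarrow`/`unarrow` is renamed from `I16xN` to `I16or32xN` to match.

Only the `*_low` variants get x86 encodings here: PMOVSXBW/PMOVSXWD for
`swiden_low` and PMOVZXBW/PMOVZXWD for `uwiden_low`, gated on
`use_sse41_simd`. The binemit test target is switched from
`has_ssse3=true` to `nehalem` accordingly. All four opcodes are left
`unimplemented!()` in the aarch64 backend.
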
---
 .../codegen/meta/src/isa/x86/encodings.rs     | 12 +++
 cranelift/codegen/meta/src/isa/x86/opcodes.rs |  4 +-
 .../codegen/meta/src/shared/instructions.rs   | 81 +++++++++++++++++--
 .../codegen/src/isa/aarch64/lower_inst.rs     |  7 +-
 .../isa/x86/simd-conversion-binemit.clif      |  9 ++-
 5 files changed, 103 insertions(+), 10 deletions(-)

diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index a58348d49b..da04019a1b 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1669,6 +1669,7 @@ fn define_simd(
     let ssub_sat = shared.by_name("ssub_sat");
     let store = shared.by_name("store");
     let store_complex = shared.by_name("store_complex");
+    let swiden_low = shared.by_name("swiden_low");
     let uadd_sat = shared.by_name("uadd_sat");
     let uload8x8 = shared.by_name("uload8x8");
     let uload8x8_complex = shared.by_name("uload8x8_complex");
@@ -1678,6 +1679,7 @@ fn define_simd(
     let uload32x2_complex = shared.by_name("uload32x2_complex");
     let snarrow = shared.by_name("snarrow");
     let unarrow = shared.by_name("unarrow");
+    let uwiden_low = shared.by_name("uwiden_low");
     let ushr_imm = shared.by_name("ushr_imm");
     let usub_sat = shared.by_name("usub_sat");
     let vconst = shared.by_name("vconst");
@@ -1915,6 +1917,16 @@ fn define_simd(
         let unarrow = unarrow.bind(vector(*ty, sse_vector_size));
         e.enc_both_inferred_maybe_isap(unarrow, rec_fa.opcodes(*opcodes), *isap);
     }
+    for (ty, swiden_opcode, uwiden_opcode) in &[
+        (I8, &PMOVSXBW[..], &PMOVZXBW[..]),
+        (I16, &PMOVSXWD[..], &PMOVZXWD[..]),
+    ] {
+        let isap = Some(use_sse41_simd);
+        let swiden_low = swiden_low.bind(vector(*ty, sse_vector_size));
+        e.enc_both_inferred_maybe_isap(swiden_low, rec_furm.opcodes(*swiden_opcode), isap);
+        let uwiden_low = uwiden_low.bind(vector(*ty, sse_vector_size));
+        e.enc_both_inferred_maybe_isap(uwiden_low, rec_furm.opcodes(*uwiden_opcode), isap);
+    }
     for ty in &[I8, I16, I32, I64] {
         e.enc_both_inferred_maybe_isap(
             x86_palignr.bind(vector(*ty, sse_vector_size)),
diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
index 25685593a6..09c07c458f 100644
--- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs
+++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs
@@ -477,7 +477,7 @@ pub static PMOVSXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x20];
 pub static PMOVSXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x23];
 
 /// Sign extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
-/// integers in xmm1.
+/// integers in xmm1 (SSE4.1).
 pub static PMOVSXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x25];
 
 /// Zero extend 8 packed 8-bit integers in the low 8 bytes of xmm2/m64 to 8 packed 16-bit
@@ -489,7 +489,7 @@ pub static PMOVZXBW: [u8; 4] = [0x66, 0x0f, 0x38, 0x30];
 pub static PMOVZXWD: [u8; 4] = [0x66, 0x0f, 0x38, 0x33];
 
 /// Zero extend 2 packed 32-bit integers in the low 8 bytes of xmm2/m64 to 2 packed 64-bit
-/// integers in xmm1.
+/// integers in xmm1 (SSE4.1).
 pub static PMOVZXDQ: [u8; 4] = [0x66, 0x0f, 0x38, 0x35];
 
 /// Multiply the packed signed word integers in xmm1 and xmm2/m128, and store the low 16 bits of
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index c78787ce82..1c06c4a325 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -3883,9 +3883,9 @@ pub(crate) fn define(
         .constraints(vec![WiderOrEq(Int.clone(), IntTo.clone())]),
     );
 
-    let I16xN = &TypeVar::new(
-        "I16xN",
-        "A SIMD vector type containing integers 16-bits wide and up",
+    let I16or32xN = &TypeVar::new(
+        "I16or32xN",
+        "A SIMD vector type containing integer lanes 16 or 32 bits wide",
         TypeSetBuilder::new()
             .ints(16..32)
             .simd_lanes(4..8)
@@ -3893,9 +3893,9 @@ pub(crate) fn define(
             .build(),
     );
 
-    let x = &Operand::new("x", I16xN);
-    let y = &Operand::new("y", I16xN);
-    let a = &Operand::new("a", &I16xN.split_lanes());
+    let x = &Operand::new("x", I16or32xN);
+    let y = &Operand::new("y", I16or32xN);
+    let a = &Operand::new("a", &I16or32xN.split_lanes());
 
     ig.push(
         Inst::new(
@@ -3934,6 +3934,75 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    let I8or16xN = &TypeVar::new(
+        "I8or16xN",
+        "A SIMD vector type containing integer lanes 8 or 16 bits wide.",
+        TypeSetBuilder::new()
+            .ints(8..16)
+            .simd_lanes(8..16)
+            .includes_scalars(false)
+            .build(),
+    );
+
+    let x = &Operand::new("x", I8or16xN);
+    let a = &Operand::new("a", &I8or16xN.merge_lanes());
+
+    ig.push(
+        Inst::new(
+            "swiden_low",
+            r#"
+        Widen the low lanes of `x` using signed extension.
+
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "swiden_high",
+            r#"
+        Widen the high lanes of `x` using signed extension.
+
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "uwiden_low",
+            r#"
+        Widen the low lanes of `x` using unsigned extension.
+
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
+    ig.push(
+        Inst::new(
+            "uwiden_high",
+            r#"
+        Widen the high lanes of `x` using unsigned extension.
+
+        This will double the lane width and halve the number of lanes.
+            "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
     let IntTo = &TypeVar::new(
         "IntTo",
         "A larger integer type with the same number of lanes",
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 7fb878c87a..88751a1478 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2154,7 +2154,12 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::AvgRound => unimplemented!(),
         Opcode::Iabs => unimplemented!(),
-        Opcode::Snarrow | Opcode::Unarrow => unimplemented!(),
+        Opcode::Snarrow
+        | Opcode::Unarrow
+        | Opcode::SwidenLow
+        | Opcode::SwidenHigh
+        | Opcode::UwidenLow
+        | Opcode::UwidenHigh => unimplemented!(),
         Opcode::TlsValue => unimplemented!(),
     }
 
diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
index b1a95c52d7..72e3412279 100644
--- a/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
+++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-binemit.clif
@@ -1,6 +1,6 @@
 test binemit
 set enable_simd
-target x86_64 has_ssse3=true
+target x86_64 nehalem
 
 ; Ensure raw_bitcast emits no instructions.
 function %raw_bitcast_i16x8_to_b32x4() {
@@ -17,3 +17,10 @@ block0(v0: i32x4 [%xmm6], v1: i32x4 [%xmm4]):
 [-, %xmm6]  v3 = x86_palignr v0, v1, 3      ; bin: 66 0f 3a 0f f4 03
             return
 }
+
+function %conversions_i16x8(i16x8) {
+block0(v0: i16x8 [%xmm6]):
+[-, %xmm2]  v1 = swiden_low v0              ; bin: 66 0f 38 23 d6
+[-, %xmm11] v2 = uwiden_low v0              ; bin: 66 44 0f 38 33 de
+            return
+}