From 508f8fa5a921240ff2d25fd2eb859366c87834a3 Mon Sep 17 00:00:00 2001
From: Andrew Brown <andrew.brown@intel.com>
Date: Tue, 2 Mar 2021 09:54:19 -0800
Subject: [PATCH] [x64] Add i64x2.abs

This instruction has a single-instruction lowering in AVX512F/VL and a three-instruction lowering in AVX, but neither is currently supported in the x64 backend. To implement this, we instead subtract the vector from 0 and use a blending instruction to pick the lanes containing the absolute value.
---
 build.rs                                      |  5 +++--
 cranelift/codegen/src/isa/x64/inst/args.rs    |  5 ++++-
 cranelift/codegen/src/isa/x64/inst/emit.rs    |  1 +
 .../codegen/src/isa/x64/inst/emit_tests.rs    |  6 +++++
 cranelift/codegen/src/isa/x64/lower.rs        | 22 ++++++++++++++++++-
 cranelift/wasm/src/code_translator.rs         |  3 +--
 6 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/build.rs b/build.rs
index 5d7d836d83..1c55f0ef58 100644
--- a/build.rs
+++ b/build.rs
@@ -183,7 +183,6 @@ fn experimental_x64_should_panic(testsuite: &str, testname: &str, strategy: &str
 
     match (testsuite, testname) {
         ("simd", "simd_i8x16_arith2") => return true, // Unsupported feature: proposed simd operator I8x16Popcnt
-        ("simd", "simd_i64x2_arith2") => return true, // Unsupported feature: proposed simd operator I64x2Abs
         ("simd", "simd_conversions") => return true, // unknown operator or unexpected token: tests/spec_testsuite/proposals/simd/simd_conversions.wast:724:6
         ("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true,
         ("simd", "simd_i16x8_extmul_i8x16") => return true,
@@ -231,7 +230,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
 
             // These are new instructions that are not really implemented in any backend.
             ("simd", "simd_i8x16_arith2")
-            | ("simd", "simd_i64x2_arith2")
             | ("simd", "simd_conversions")
             | ("simd", "simd_i16x8_extadd_pairwise_i8x16")
             | ("simd", "simd_i16x8_extmul_i8x16")
@@ -250,6 +248,9 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
             | ("simd", "simd_store64_lane")
             | ("simd", "simd_store8_lane") => return true,
 
+            // These are only implemented on x64.
+            ("simd", "simd_i64x2_arith2") => return !cfg!(feature = "experimental_x64"),
+
             // These are only implemented on aarch64 and x64.
             ("simd", "simd_i64x2_cmp")
             | ("simd", "simd_f32x4_pmin_pmax")
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index 930839459b..f68c50449f 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -470,6 +470,7 @@ pub enum SseOpcode {
     Andpd,
     Andnps,
     Andnpd,
+    Blendvpd,
     Comiss,
     Comisd,
     Cmpps,
@@ -758,7 +759,8 @@ impl SseOpcode {
             | SseOpcode::Palignr
             | SseOpcode::Pshufb => SSSE3,
 
-            SseOpcode::Insertps
+            SseOpcode::Blendvpd
+            | SseOpcode::Insertps
             | SseOpcode::Packusdw
             | SseOpcode::Pcmpeqq
             | SseOpcode::Pextrb
@@ -816,6 +818,7 @@ impl fmt::Debug for SseOpcode {
             SseOpcode::Andps => "andps",
             SseOpcode::Andnps => "andnps",
             SseOpcode::Andnpd => "andnpd",
+            SseOpcode::Blendvpd => "blendvpd",
             SseOpcode::Cmpps => "cmpps",
             SseOpcode::Cmppd => "cmppd",
             SseOpcode::Cmpss => "cmpss",
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index eeb81d19df..7ef5304635 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1845,6 +1845,7 @@ pub(crate) fn emit(
                 SseOpcode::Andpd => (LegacyPrefixes::_66, 0x0F54, 2),
                 SseOpcode::Andnps => (LegacyPrefixes::None, 0x0F55, 2),
                 SseOpcode::Andnpd => (LegacyPrefixes::_66, 0x0F55, 2),
+                SseOpcode::Blendvpd => (LegacyPrefixes::_66, 0x0F3815, 3),
                 SseOpcode::Cvttps2dq => (LegacyPrefixes::_F3, 0x0F5B, 2),
                 SseOpcode::Cvtdq2ps => (LegacyPrefixes::None, 0x0F5B, 2),
                 SseOpcode::Divps => (LegacyPrefixes::None, 0x0F5E, 2),
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index f5d7535da4..e407910c2d 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3426,6 +3426,12 @@ fn test_x64_emit() {
         "orps    %xmm5, %xmm4",
     ));
 
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Blendvpd, RegMem::reg(xmm15), w_xmm4),
+        "66410F3815E7",
+        "blendvpd %xmm15, %xmm4",
+    ));
+
     // ========================================================
     // XMM_RM_R: Integer Packed
 
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 827edc69ce..190462caaf 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -1853,7 +1853,27 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             let src = input_to_reg_mem(ctx, inputs[0]);
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let ty = ty.unwrap();
-            if ty.is_vector() {
+            if ty == types::I64X2 {
+                // This lowering could be a single instruction with AVX512F/VL's VPABSQ instruction.
+                // Instead, we use a separate register, `tmp`, to contain the results of `0 - src`
+                // and then use `BLENDVPD` to blend the original `src` lanes back in wherever the
+                // MSB of `tmp` is 1 (i.e. `tmp` is negative or, conversely, `src` was positive).
+
+                // Emit all 0s into the `tmp` register.
+                let tmp = ctx.alloc_tmp(ty).only_reg().unwrap();
+                ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp));
+                // Subtract the lanes from 0 and set up `dst`.
+                ctx.emit(Inst::xmm_rm_r(SseOpcode::Psubq, src.clone(), tmp));
+                ctx.emit(Inst::gen_move(dst, tmp.to_reg(), ty));
+                // Choose the original `src` lanes when `tmp` has an MSB of 1. BLENDVPD's semantics
+                // require the "choice" mask to be in XMM0.
+                ctx.emit(Inst::gen_move(
+                    Writable::from_reg(regs::xmm0()),
+                    tmp.to_reg(),
+                    ty,
+                ));
+                ctx.emit(Inst::xmm_rm_r(SseOpcode::Blendvpd, src, dst));
+            } else if ty.is_vector() {
                 let opcode = match ty {
                     types::I8X16 => SseOpcode::Pabsb,
                     types::I16X8 => SseOpcode::Pabsw,
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index adeb7bda2d..ba933d7b53 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1564,7 +1564,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let a = pop1_with_bitcast(state, type_of(op), builder);
             state.push1(builder.ins().ineg(a))
         }
-        Operator::I8x16Abs | Operator::I16x8Abs | Operator::I32x4Abs => {
+        Operator::I8x16Abs | Operator::I16x8Abs | Operator::I32x4Abs | Operator::I64x2Abs => {
             let a = pop1_with_bitcast(state, type_of(op), builder);
             state.push1(builder.ins().iabs(a))
         }
@@ -1852,7 +1852,6 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         | Operator::I64x2ExtMulHighI32x4S
         | Operator::I64x2ExtMulLowI32x4U
         | Operator::I64x2ExtMulHighI32x4U
-        | Operator::I64x2Abs
         | Operator::I64x2AllTrue
         | Operator::I16x8ExtAddPairwiseI8x16S
         | Operator::I16x8ExtAddPairwiseI8x16U