From 2d676d838fc718e62a8b5c46ba00745b1f8590c9 Mon Sep 17 00:00:00 2001
From: Johnnie Birch <johnnie.l.birch.jr@intel.com>
Date: Sat, 5 Jun 2021 23:15:50 -0700
Subject: [PATCH] Implements f64x2.convert_low_i32x4_u for x64

---
 build.rs                                      |  1 -
 .../codegen/meta/src/shared/instructions.rs   | 21 ++++++++
 .../codegen/src/isa/aarch64/lower_inst.rs     |  1 +
 cranelift/codegen/src/isa/s390x/lower.rs      |  1 +
 cranelift/codegen/src/isa/x64/inst/args.rs    |  3 ++
 cranelift/codegen/src/isa/x64/inst/emit.rs    |  1 +
 .../codegen/src/isa/x64/inst/emit_tests.rs    |  6 +++
 cranelift/codegen/src/isa/x64/lower.rs        | 52 +++++++++++++++++++
 cranelift/codegen/src/isa/x64/mod.rs          |  6 +--
 cranelift/interpreter/src/step.rs             |  1 +
 cranelift/wasm/src/code_translator.rs         |  7 ++-
 11 files changed, 94 insertions(+), 6 deletions(-)
diff --git a/build.rs b/build.rs
index 59d0914562..f049208a60 100644
--- a/build.rs
+++ b/build.rs
@@ -189,7 +189,6 @@ fn x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool {
     }
 
     match (testsuite, testname) {
-        ("simd", "simd_conversions") => return true, // unknown operator or unexpected token: tests/spec_testsuite/proposals/simd/simd_conversions.wast:724:6
         ("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true,
         ("simd", "simd_i16x8_extmul_i8x16") => return true,
         ("simd", "simd_i16x8_q15mulr_sat_s") => return true,
diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs
index 06e20c6198..1964ea8f75 100644
--- a/cranelift/codegen/meta/src/shared/instructions.rs
+++ b/cranelift/codegen/meta/src/shared/instructions.rs
@@ -4457,6 +4457,27 @@ pub(crate) fn define(
         .operands_out(vec![a]),
     );
 
+    ig.push(
+        Inst::new(
+            "fcvt_low_from_uint",
+            r#"
+
+        Converts packed unsigned 32-bit integers to packed double precision floating point.
+
+        Considering only the low half of the register, each lane in `x` is interpreted as an
+        unsigned 32-bit integer that is then converted to a double precision float. Each
+        resulting lane occupies twice the number of bits of its input lane. No rounding
+        should be needed for the resulting float.
+
+        The result type will have half the number of vector lanes as the input.
+
+        "#,
+            &formats.unary,
+        )
+        .operands_in(vec![x])
+        .operands_out(vec![a]),
+    );
+
     let WideInt = &TypeVar::new(
         "WideInt",
         "An integer type with lanes from `i16` upwards",
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index 8c46602cbd..67e99917e6 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -3557,6 +3557,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
 
         Opcode::ConstAddr
         | Opcode::FcvtLowFromSint
+        | Opcode::FcvtLowFromUint
         | Opcode::Fvdemote
         | Opcode::FvpromoteLow
         | Opcode::Vconcat
diff --git a/cranelift/codegen/src/isa/s390x/lower.rs b/cranelift/codegen/src/isa/s390x/lower.rs
index 8ab66add04..9cc33eff50 100644
--- a/cranelift/codegen/src/isa/s390x/lower.rs
+++ b/cranelift/codegen/src/isa/s390x/lower.rs
@@ -2867,6 +2867,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
         | Opcode::UwidenHigh
         | Opcode::WideningPairwiseDotProductS
         | Opcode::SqmulRoundSat
+        | Opcode::FcvtLowFromUint
         | Opcode::FvpromoteLow
         | Opcode::Fvdemote => {
             // TODO
diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs
index c362075061..13ad1ca836 100644
--- a/cranelift/codegen/src/isa/x64/inst/args.rs
+++ b/cranelift/codegen/src/isa/x64/inst/args.rs
@@ -635,6 +635,7 @@ pub enum SseOpcode {
     Subsd,
     Ucomiss,
     Ucomisd,
+    Unpcklps,
     Xorps,
     Xorpd,
 }
@@ -675,6 +676,7 @@ impl SseOpcode {
             | SseOpcode::Subps
             | SseOpcode::Subss
             | SseOpcode::Ucomiss
+            | SseOpcode::Unpcklps
             | SseOpcode::Xorps => SSE,
 
             SseOpcode::Addpd
@@ -993,6 +995,7 @@ impl fmt::Debug for SseOpcode {
             SseOpcode::Subsd => "subsd",
             SseOpcode::Ucomiss => "ucomiss",
             SseOpcode::Ucomisd => "ucomisd",
+            SseOpcode::Unpcklps => "unpcklps",
             SseOpcode::Xorps => "xorps",
             SseOpcode::Xorpd => "xorpd",
         };
diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs
index 441d89fa91..534b6be168 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit.rs
@@ -1529,6 +1529,7 @@ pub(crate) fn emit(
                 SseOpcode::Subpd => (LegacyPrefixes::_66, 0x0F5C, 2),
                 SseOpcode::Subss => (LegacyPrefixes::_F3, 0x0F5C, 2),
                 SseOpcode::Subsd => (LegacyPrefixes::_F2, 0x0F5C, 2),
+                SseOpcode::Unpcklps => (LegacyPrefixes::None, 0x0F14, 2),
                 SseOpcode::Xorps => (LegacyPrefixes::None, 0x0F57, 2),
                 SseOpcode::Xorpd => (LegacyPrefixes::_66, 0x0F57, 2),
                 _ => unimplemented!("Opcode {:?} not implemented", op),
diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
index a77882c3f6..3ecf4b7e62 100644
--- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs
@@ -3717,6 +3717,12 @@ fn test_x64_emit() {
         "punpcklbw %xmm1, %xmm8",
     ));
 
+    insns.push((
+        Inst::xmm_rm_r(SseOpcode::Unpcklps, RegMem::reg(xmm11), w_xmm2),
+        "410F14D3",
+        "unpcklps %xmm11, %xmm2",
+    ));
+
     // ========================================================
     // XMM_RM_R: Integer Conversion
     insns.push((
diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index ad74062e7e..4e0d67e2d1 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -4154,6 +4154,58 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
                 dst,
             ));
         }
+        Opcode::FcvtLowFromUint => {
+            // Algorithm uses unpcklps to help create a float that is equivalent to
+            // 0x1.0p52 + double(src). 0x1.0p52 is unique because at this exponent
+            // every value of the mantissa represents a corresponding uint32 number.
+            // When we subtract 0x1.0p52 we are left with double(src).
+            let src = put_input_in_reg(ctx, inputs[0]);
+            let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            let uint_mask = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
+
+            ctx.emit(Inst::gen_move(dst, src, types::I32X4));
+
+            static UINT_MASK: [u8; 16] = [
+                0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                0x00, 0x00,
+            ];
+
+            let uint_mask_const = ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK));
+
+            ctx.emit(Inst::xmm_load_const(
+                uint_mask_const,
+                uint_mask,
+                types::I32X4,
+            ));
+
+            // Creates 0x1.0p52 + double(src)
+            ctx.emit(Inst::xmm_rm_r(
+                SseOpcode::Unpcklps,
+                RegMem::from(uint_mask),
+                dst,
+            ));
+
+            static UINT_MASK_HIGH: [u8; 16] = [
+                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                0x30, 0x43,
+            ];
+
+            let uint_mask_high_const =
+                ctx.use_constant(VCodeConstantData::WellKnown(&UINT_MASK_HIGH));
+            let uint_mask_high = ctx.alloc_tmp(types::I32X4).only_reg().unwrap();
+            ctx.emit(Inst::xmm_load_const(
+                uint_mask_high_const,
+                uint_mask_high,
+                types::I32X4,
+            ));
+
+            // 0x1.0p52 + double(src) - 0x1.0p52
+            ctx.emit(Inst::xmm_rm_r(
+                SseOpcode::Subpd,
+                RegMem::from(uint_mask_high),
+                dst,
+            ));
+        }
         Opcode::FcvtFromUint => {
             let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
             let ty = ty.unwrap();
diff --git a/cranelift/codegen/src/isa/x64/mod.rs b/cranelift/codegen/src/isa/x64/mod.rs
index e4933c0586..381898e485 100644
--- a/cranelift/codegen/src/isa/x64/mod.rs
+++ b/cranelift/codegen/src/isa/x64/mod.rs
@@ -4,6 +4,8 @@ use self::inst::EmitInfo;
 
 use super::TargetIsa;
 use crate::ir::{condcodes::IntCC, Function};
+#[cfg(feature = "unwind")]
+use crate::isa::unwind::systemv;
 use crate::isa::x64::{inst::regs::create_reg_universe_systemv, settings as x64_settings};
 use crate::isa::Builder as IsaBuilder;
 use crate::machinst::{compile, MachBackend, MachCompileResult, TargetIsaAdapter, VCode};
@@ -11,12 +13,10 @@ use crate::result::CodegenResult;
 use crate::settings::{self as shared_settings, Flags};
 use alloc::{boxed::Box, vec::Vec};
 use core::hash::{Hash, Hasher};
+
 use regalloc::{PrettyPrint, RealRegUniverse, Reg};
 use target_lexicon::Triple;
 
-#[cfg(feature = "unwind")]
-use crate::isa::unwind::systemv;
-
 mod abi;
 pub mod encoding;
 mod inst;
diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs
index 609603f624..9440632f44 100644
--- a/cranelift/interpreter/src/step.rs
+++ b/cranelift/interpreter/src/step.rs
@@ -565,6 +565,7 @@ where
         Opcode::FcvtFromUint => unimplemented!("FcvtFromUint"),
         Opcode::FcvtFromSint => unimplemented!("FcvtFromSint"),
         Opcode::FcvtLowFromSint => unimplemented!("FcvtLowFromSint"),
+        Opcode::FcvtLowFromUint => unimplemented!("FcvtLowFromUint"),
         Opcode::FvpromoteLow => unimplemented!("FvpromoteLow"),
         Opcode::Fvdemote => unimplemented!("Fvdemote"),
         Opcode::Isplit => unimplemented!("Isplit"),
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 6741de2c64..00fd529248 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -1778,6 +1778,10 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
             let a = pop1_with_bitcast(state, I32X4, builder);
             state.push1(builder.ins().fcvt_low_from_sint(F64X2, a));
         }
+        Operator::F64x2ConvertLowI32x4U => {
+            let a = pop1_with_bitcast(state, I32X4, builder);
+            state.push1(builder.ins().fcvt_low_from_uint(F64X2, a));
+        }
         Operator::F64x2PromoteLowF32x4 => {
             let a = pop1_with_bitcast(state, F32X4, builder);
             state.push1(builder.ins().fvpromote_low(a));
@@ -1921,8 +1925,7 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
         | Operator::I16x8ExtAddPairwiseI8x16S
         | Operator::I16x8ExtAddPairwiseI8x16U
         | Operator::I32x4ExtAddPairwiseI16x8S
-        | Operator::I32x4ExtAddPairwiseI16x8U
-        | Operator::F64x2ConvertLowI32x4U => {
+        | Operator::I32x4ExtAddPairwiseI16x8U => {
             return Err(wasm_unsupported!("proposed simd operator {:?}", op));
         }
         Operator::ReturnCall { .. } | Operator::ReturnCallIndirect { .. } => {