From bb2dd5b68b367df3305a98914eb9baeafa80bc03 Mon Sep 17 00:00:00 2001
From: Andrew Brown
Date: Mon, 7 Dec 2020 13:59:31 -0800
Subject: [PATCH] [machinst x64]: implement load*_zero for x64

---
 cranelift/codegen/src/isa/x64/lower.rs        | 47 +++++++++++++++++++
 .../isa/x64/simd-lane-access-compile.clif     | 31 ++++++++++++
 2 files changed, 78 insertions(+)

diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs
index 6b779a81d8..39d6f9fb57 100644
--- a/cranelift/codegen/src/isa/x64/lower.rs
+++ b/cranelift/codegen/src/isa/x64/lower.rs
@@ -4095,6 +4095,53 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
             emit_extract_lane(ctx, src, dst, lane, ty);
         }
 
+        Opcode::ScalarToVector => {
+            // When moving a scalar value to a vector register, we must handle several
+            // situations:
+            // 1. a scalar float is already in an XMM register, so we simply move it
+            // 2. a scalar of any other type resides in a GPR: MOVD moves the bits to an
+            //    XMM register and zeroes the upper bits
+            // 3. a scalar (float or otherwise) that has previously been loaded from memory (e.g.
+            //    the default lowering of Wasm's `load[32|64]_zero`) can be lowered to a single
+            //    MOVSS/MOVSD instruction; to do this, we rely on `input_to_reg_mem` to sink the
+            //    unused load.
+            let src = input_to_reg_mem(ctx, inputs[0]);
+            let src_ty = ctx.input_ty(insn, 0);
+            let dst = get_output_reg(ctx, outputs[0]);
+            let dst_ty = ty.unwrap();
+            assert!(src_ty == dst_ty.lane_type() && dst_ty.bits() == 128);
+            match src {
+                RegMem::Reg { reg } => {
+                    if src_ty.is_float() {
+                        // Case 1: when moving a scalar float, we simply move from one XMM register
+                        // to another, expecting the register allocator to elide this. Here we
+                        // assume that the upper bits of a scalar float have not been munged
+                        // (the same assumption the old backend makes).
+                        ctx.emit(Inst::gen_move(dst, reg, dst_ty));
+                    } else {
+                        // Case 2: when moving a scalar value of any other type, use MOVD to zero
+                        // the upper lanes.
+                        let src_size = match src_ty.bits() {
+                            32 => OperandSize::Size32,
+                            64 => OperandSize::Size64,
+                            _ => unimplemented!("invalid source size for type: {}", src_ty),
+                        };
+                        ctx.emit(Inst::gpr_to_xmm(SseOpcode::Movd, src, src_size, dst));
+                    }
+                }
+                RegMem::Mem { .. } => {
+                    // Case 3: when presented with `load + scalar_to_vector`, coalesce into a single
+                    // MOVSS/MOVSD instruction.
+                    let opcode = match src_ty.bits() {
+                        32 => SseOpcode::Movss,
+                        64 => SseOpcode::Movsd,
+                        _ => unimplemented!("unable to move scalar to vector for type: {}", src_ty),
+                    };
+                    ctx.emit(Inst::xmm_mov(opcode, src, dst));
+                }
+            }
+        }
+
         Opcode::Splat => {
             let ty = ty.unwrap();
             assert_eq!(ty.bits(), 128);
diff --git a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif
index f44dbd3b62..f451bd2a25 100644
--- a/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif
+++ b/cranelift/filetests/filetests/isa/x64/simd-lane-access-compile.clif
@@ -91,3 +91,34 @@ block0(v0: f64):
 ; check: uninit %xmm1
 ; nextln: movsd %xmm0, %xmm1
 ; nextln: movlhps %xmm0, %xmm1
+
+
+
+;; load*_zero
+
+; Verify that a `load` followed by a `scalar_to_vector` (the CLIF translation of `load32_zero`) is
+; lowered to a single MOVSS instruction.
+function %load32_zero_coalesced(i64) -> i32x4 {
+block0(v0: i64):
+    v1 = load.i32 v0
+    v2 = scalar_to_vector.i32x4 v1
+    ; check: movss 0(%rdi), %xmm0
+    return v2
+}
+
+;; Verify that `scalar_to_vector` (used by `load32_zero`) lowers as expected when its input is already in a register.
+function %load32_zero_int(i32) -> i32x4 {
+block0(v0: i32):
+    v1 = scalar_to_vector.i32x4 v0
+    ; check: movd %edi, %xmm0
+    return v1
+}
+function %load32_zero_float(f32) -> f32x4 {
+block0(v0: f32):
+    v1 = scalar_to_vector.f32x4 v0
+    ; regex: MOV=movap*
+    ; check: pushq
+    ; not: $MOV
+    ; check: ret
+    return v1
+}
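
Note: the filetests above only exercise the MOVSS (32-bit) side of the load coalescing described in case 3. A minimal sketch of an analogous 64-bit test that could be appended to simd-lane-access-compile.clif is shown below; the function name %load64_zero_coalesced and the expected MOVSD disassembly are assumptions derived from the lowering above, not checks taken from this patch.

; Sketch only: expected output assumed by analogy with %load32_zero_coalesced above.
function %load64_zero_coalesced(i64) -> i64x2 {
block0(v0: i64):
    v1 = load.i64 v0
    v2 = scalar_to_vector.i64x2 v1
    ; check: movsd 0(%rdi), %xmm0
    return v2
}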