From e0b911a4dfda7fe1d16da1fb1d9ee8eaed97acd7 Mon Sep 17 00:00:00 2001 From: Anton Kirilov Date: Wed, 7 Oct 2020 11:29:55 +0100 Subject: [PATCH 1/3] Introduce the Cranelift IR instruction `LoadSplat` It corresponds to WebAssembly's `load*_splat` operations, which were previously represented as a combination of `Load` and `Splat` instructions. However, there are architectures such as Armv8-A that have a single machine instruction equivalent to the Wasm operations. In order to generate it, it is necessary to merge the `Load` and the `Splat` in the backend, which is not possible because the load may have side effects. The new IR instruction works around this limitation. The AArch64 backend leverages the new instruction to improve code generation. Copyright (c) 2020, Arm Limited. --- .../codegen/meta/src/isa/x86/legalize.rs | 2 + .../codegen/meta/src/shared/instructions.rs | 19 +++ .../codegen/src/isa/aarch64/inst/args.rs | 15 ++ .../codegen/src/isa/aarch64/inst/emit.rs | 145 +++++++++++------- .../src/isa/aarch64/inst/emit_tests.rs | 34 +++- cranelift/codegen/src/isa/aarch64/inst/mod.rs | 30 +++- .../codegen/src/isa/aarch64/lower_inst.rs | 23 +++ cranelift/codegen/src/isa/x86/enc_tables.rs | 28 ++++ cranelift/wasm/src/code_translator.rs | 16 +- 9 files changed, 237 insertions(+), 75 deletions(-) diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs index de78c3b3b7..681b3104d5 100644 --- a/cranelift/codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs @@ -396,6 +396,7 @@ fn define_simd( let insertlane = insts.by_name("insertlane"); let ishl = insts.by_name("ishl"); let ishl_imm = insts.by_name("ishl_imm"); + let load_splat = insts.by_name("load_splat"); let raw_bitcast = insts.by_name("raw_bitcast"); let scalar_to_vector = insts.by_name("scalar_to_vector"); let splat = insts.by_name("splat"); @@ -820,6 +821,7 @@ fn define_simd( narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector"); narrow.custom_legalize(fmin, "expand_minmax_vector"); narrow.custom_legalize(fmax, "expand_minmax_vector"); + narrow.custom_legalize(load_splat, "expand_load_splat"); narrow_avx.custom_legalize(imul, "convert_i64x2_imul"); narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector"); diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 2c16734f27..9cb77493c7 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -4409,5 +4409,24 @@ pub(crate) fn define( .other_side_effects(true), ); + let Offset = &Operand::new("Offset", &imm.offset32).with_doc("Byte offset from base address"); + let a = &Operand::new("a", TxN); + + ig.push( + Inst::new( + "load_splat", + r#" + Load an element from memory at ``p + Offset`` and return a vector + whose lanes are all set to that element. + + This is equivalent to ``load`` followed by ``splat``. 
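+
+ For example, WebAssembly's ``v128.load32_splat`` maps to a single
+ ``load_splat`` that produces an ``i32x4`` value, instead of a
+ ``load.i32`` followed by a ``splat.i32x4``. Keeping the load and the
+ splat fused in the IR lets a backend select a combined
+ load-and-replicate machine instruction such as AArch64's ``LD1R``.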
+ "#, + &formats.load, + ) + .operands_in(vec![MemFlags, p, Offset]) + .operands_out(vec![a]) + .can_load(true), + ); + ig.build() } diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs index f85c1028ff..95bf4bb63f 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -680,4 +680,19 @@ impl VectorSize { _ => *self, } } + + /// Return the encoding bits that are used by some SIMD instructions + /// for a particular operand size. + pub fn enc_size(&self) -> (u32, u32) { + let q = self.is_128bits() as u32; + let size = match self.lane_size() { + ScalarSize::Size8 => 0b00, + ScalarSize::Size16 => 0b01, + ScalarSize::Size32 => 0b10, + ScalarSize::Size64 => 0b11, + _ => unreachable!(), + }; + + (q, size) + } } diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index d422fdc24f..124fd36c87 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -248,6 +248,16 @@ fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 { (op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd) } +fn enc_ldst_vec(q: u32, size: u32, rn: Reg, rt: Writable) -> u32 { + debug_assert_eq!(q & 0b1, q); + debug_assert_eq!(size & 0b11, size); + 0b0_0_0011010_10_00000_110_0_00_00000_00000 + | q << 30 + | size << 10 + | machreg_to_gpr(rn) << 5 + | machreg_to_vec(rt.to_reg()) +} + fn enc_extend(top22: u32, rd: Writable, rn: Reg) -> u32 { (top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg()) } @@ -1380,14 +1390,7 @@ impl MachInstEmit for Inst { sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra)); } &Inst::VecMisc { op, rd, rn, size } => { - let enc_size = match size.lane_size() { - ScalarSize::Size8 => 0b00, - ScalarSize::Size16 => 0b01, - ScalarSize::Size32 => 0b10, - ScalarSize::Size64 => 0b11, - _ => unreachable!(), - }; - let q = if size.is_128bits() { 1 } else { 0 }; + let (q, enc_size) = size.enc_size(); let (u, bits_12_16, size) = match op { VecMisc2::Not => (0b1, 0b00101, 0b00), VecMisc2::Neg => (0b1, 0b01011, enc_size), @@ -1756,13 +1759,7 @@ impl MachInstEmit for Inst { alu_op, size, } => { - let enc_size = match size.lane_size() { - ScalarSize::Size8 => 0b00, - ScalarSize::Size16 => 0b01, - ScalarSize::Size32 => 0b10, - ScalarSize::Size64 => 0b11, - _ => unreachable!(), - }; + let (q, enc_size) = size.enc_size(); let is_float = match alu_op { VecALUOp::Fcmeq | VecALUOp::Fcmgt @@ -1776,6 +1773,7 @@ impl MachInstEmit for Inst { _ => false, }; let enc_float_size = match (is_float, size) { + (true, VectorSize::Size32x2) => 0b0, (true, VectorSize::Size32x4) => 0b0, (true, VectorSize::Size64x2) => 0b1, (true, _) => unimplemented!(), @@ -1783,58 +1781,73 @@ impl MachInstEmit for Inst { }; let (top11, bit15_10) = match alu_op { - VecALUOp::Sqadd => (0b010_01110_00_1 | enc_size << 1, 0b000011), - VecALUOp::Sqsub => (0b010_01110_00_1 | enc_size << 1, 0b001011), - VecALUOp::Uqadd => (0b011_01110_00_1 | enc_size << 1, 0b000011), - VecALUOp::Uqsub => (0b011_01110_00_1 | enc_size << 1, 0b001011), - VecALUOp::Cmeq => (0b011_01110_00_1 | enc_size << 1, 0b100011), - VecALUOp::Cmge => (0b010_01110_00_1 | enc_size << 1, 0b001111), - VecALUOp::Cmgt => (0b010_01110_00_1 | enc_size << 1, 0b001101), - VecALUOp::Cmhi => (0b011_01110_00_1 | enc_size << 1, 0b001101), - VecALUOp::Cmhs => (0b011_01110_00_1 | enc_size << 1, 0b001111), - VecALUOp::Fcmeq => (0b010_01110_00_1, 0b111001), - 
VecALUOp::Fcmgt => (0b011_01110_10_1, 0b111001), - VecALUOp::Fcmge => (0b011_01110_00_1, 0b111001), + VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011), + VecALUOp::Sqsub => (0b000_01110_00_1 | enc_size << 1, 0b001011), + VecALUOp::Uqadd => (0b001_01110_00_1 | enc_size << 1, 0b000011), + VecALUOp::Uqsub => (0b001_01110_00_1 | enc_size << 1, 0b001011), + VecALUOp::Cmeq => (0b001_01110_00_1 | enc_size << 1, 0b100011), + VecALUOp::Cmge => (0b000_01110_00_1 | enc_size << 1, 0b001111), + VecALUOp::Cmgt => (0b000_01110_00_1 | enc_size << 1, 0b001101), + VecALUOp::Cmhi => (0b001_01110_00_1 | enc_size << 1, 0b001101), + VecALUOp::Cmhs => (0b001_01110_00_1 | enc_size << 1, 0b001111), + VecALUOp::Fcmeq => (0b000_01110_00_1, 0b111001), + VecALUOp::Fcmgt => (0b001_01110_10_1, 0b111001), + VecALUOp::Fcmge => (0b001_01110_00_1, 0b111001), // The following logical instructions operate on bytes, so are not encoded differently // for the different vector types. - VecALUOp::And => (0b010_01110_00_1, 0b000111), - VecALUOp::Bic => (0b010_01110_01_1, 0b000111), - VecALUOp::Orr => (0b010_01110_10_1, 0b000111), - VecALUOp::Eor => (0b011_01110_00_1, 0b000111), - VecALUOp::Bsl => (0b011_01110_01_1, 0b000111), - VecALUOp::Umaxp => (0b011_01110_00_1 | enc_size << 1, 0b101001), - VecALUOp::Add => (0b010_01110_00_1 | enc_size << 1, 0b100001), - VecALUOp::Sub => (0b011_01110_00_1 | enc_size << 1, 0b100001), + VecALUOp::And => (0b000_01110_00_1, 0b000111), + VecALUOp::Bic => (0b000_01110_01_1, 0b000111), + VecALUOp::Orr => (0b000_01110_10_1, 0b000111), + VecALUOp::Eor => (0b001_01110_00_1, 0b000111), + VecALUOp::Bsl => (0b001_01110_01_1, 0b000111), + VecALUOp::Umaxp => (0b001_01110_00_1 | enc_size << 1, 0b101001), + VecALUOp::Add => (0b000_01110_00_1 | enc_size << 1, 0b100001), + VecALUOp::Sub => (0b001_01110_00_1 | enc_size << 1, 0b100001), VecALUOp::Mul => { debug_assert_ne!(size, VectorSize::Size64x2); - (0b010_01110_00_1 | enc_size << 1, 0b100111) + (0b000_01110_00_1 | enc_size << 1, 0b100111) } - VecALUOp::Sshl => (0b010_01110_00_1 | enc_size << 1, 0b010001), - VecALUOp::Ushl => (0b011_01110_00_1 | enc_size << 1, 0b010001), - VecALUOp::Umin => (0b011_01110_00_1 | enc_size << 1, 0b011011), - VecALUOp::Smin => (0b010_01110_00_1 | enc_size << 1, 0b011011), - VecALUOp::Umax => (0b011_01110_00_1 | enc_size << 1, 0b011001), - VecALUOp::Smax => (0b010_01110_00_1 | enc_size << 1, 0b011001), - VecALUOp::Urhadd => (0b011_01110_00_1 | enc_size << 1, 0b000101), - VecALUOp::Fadd => (0b010_01110_00_1, 0b110101), - VecALUOp::Fsub => (0b010_01110_10_1, 0b110101), - VecALUOp::Fdiv => (0b011_01110_00_1, 0b111111), - VecALUOp::Fmax => (0b010_01110_00_1, 0b111101), - VecALUOp::Fmin => (0b010_01110_10_1, 0b111101), - VecALUOp::Fmul => (0b011_01110_00_1, 0b110111), - VecALUOp::Addp => (0b010_01110_00_1 | enc_size << 1, 0b101111), + VecALUOp::Sshl => (0b000_01110_00_1 | enc_size << 1, 0b010001), + VecALUOp::Ushl => (0b001_01110_00_1 | enc_size << 1, 0b010001), + VecALUOp::Umin => (0b001_01110_00_1 | enc_size << 1, 0b011011), + VecALUOp::Smin => (0b000_01110_00_1 | enc_size << 1, 0b011011), + VecALUOp::Umax => (0b001_01110_00_1 | enc_size << 1, 0b011001), + VecALUOp::Smax => (0b000_01110_00_1 | enc_size << 1, 0b011001), + VecALUOp::Urhadd => (0b001_01110_00_1 | enc_size << 1, 0b000101), + VecALUOp::Fadd => (0b000_01110_00_1, 0b110101), + VecALUOp::Fsub => (0b000_01110_10_1, 0b110101), + VecALUOp::Fdiv => (0b001_01110_00_1, 0b111111), + VecALUOp::Fmax => (0b000_01110_00_1, 0b111101), + VecALUOp::Fmin => (0b000_01110_10_1, 
0b111101), + VecALUOp::Fmul => (0b001_01110_00_1, 0b110111), + VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111), VecALUOp::Umlal => { debug_assert!(!size.is_128bits()); (0b001_01110_00_1 | enc_size << 1, 0b100000) } }; let top11 = if is_float { - top11 | enc_float_size << 1 + top11 | (q << 9) | enc_float_size << 1 } else { - top11 + top11 | (q << 9) }; sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd)); } + &Inst::VecLoadReplicate { + rd, + rn, + size, + srcloc, + } => { + let (q, size) = size.enc_size(); + + if let Some(srcloc) = srcloc { + // Register the offset at which the actual load instruction starts. + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } + + sink.put4(enc_ldst_vec(q, size, rn, rd)); + } &Inst::MovToNZCV { rn } => { sink.put4(0xd51b4200 | machreg_to_gpr(rn)); } @@ -2119,9 +2132,12 @@ impl MachInstEmit for Inst { inst.emit(sink, emit_info, state); } - let (reg, offset) = match mem { - AMode::Unscaled(r, simm9) => (r, simm9.value()), - AMode::UnsignedOffset(r, uimm12scaled) => (r, uimm12scaled.value() as i32), + let (reg, index_reg, offset) = match mem { + AMode::RegExtended(r, idx, extendop) => (r, Some((idx, extendop)), 0), + AMode::Unscaled(r, simm9) => (r, None, simm9.value()), + AMode::UnsignedOffset(r, uimm12scaled) => { + (r, None, uimm12scaled.value() as i32) + } _ => panic!("Unsupported case for LoadAddr: {:?}", mem), }; let abs_offset = if offset < 0 { @@ -2135,9 +2151,22 @@ impl MachInstEmit for Inst { ALUOp::Add64 }; - if offset == 0 { - let mov = Inst::mov(rd, reg); - mov.emit(sink, emit_info, state); + if let Some((idx, extendop)) = index_reg { + let add = Inst::AluRRRExtend { + alu_op: ALUOp::Add64, + rd, + rn: reg, + rm: idx, + extendop, + }; + + add.emit(sink, emit_info, state); + } else if offset == 0 { + if reg != rd.to_reg() { + let mov = Inst::mov(rd, reg); + + mov.emit(sink, emit_info, state); + } } else if let Some(imm12) = Imm12::maybe_from_u64(abs_offset) { let add = Inst::AluRRImm12 { alu_op, diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs index 6d981c2eaa..48707610ff 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs @@ -2533,10 +2533,10 @@ fn test_aarch64_binemit() { rd: writable_vreg(28), rn: vreg(12), rm: vreg(4), - size: VectorSize::Size32x4, + size: VectorSize::Size32x2, }, - "9CE5244E", - "fcmeq v28.4s, v12.4s, v4.4s", + "9CE5240E", + "fcmeq v28.2s, v12.2s, v4.2s", )); insns.push(( @@ -2965,10 +2965,10 @@ fn test_aarch64_binemit() { rd: writable_vreg(6), rn: vreg(9), rm: vreg(8), - size: VectorSize::Size8x16, + size: VectorSize::Size8x8, }, - "2665286E", - "umax v6.16b, v9.16b, v8.16b", + "2665282E", + "umax v6.8b, v9.8b, v8.8b", )); insns.push(( @@ -3507,6 +3507,28 @@ fn test_aarch64_binemit() { "tbx v3.16b, { v11.16b, v12.16b }, v19.16b", )); + insns.push(( + Inst::VecLoadReplicate { + rd: writable_vreg(31), + rn: xreg(0), + srcloc: None, + size: VectorSize::Size64x2, + }, + "1FCC404D", + "ld1r { v31.2d }, [x0]", + )); + + insns.push(( + Inst::VecLoadReplicate { + rd: writable_vreg(0), + rn: xreg(25), + srcloc: None, + size: VectorSize::Size8x8, + }, + "20C3400D", + "ld1r { v0.8b }, [x25]", + )); + insns.push(( Inst::Extend { rd: writable_xreg(1), diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs index 544d04c23c..e9c0f15425 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs +++ 
b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -975,6 +975,14 @@ pub enum Inst { is_extension: bool, }, + /// Load an element and replicate to all lanes of a vector. + VecLoadReplicate { + rd: Writable, + rn: Reg, + size: VectorSize, + srcloc: Option, + }, + /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn). MovToNZCV { rn: Reg, @@ -1609,7 +1617,10 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(rd); } } - + &Inst::VecLoadReplicate { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => { collector.add_use(rn); collector.add_use(rm); @@ -1762,8 +1773,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { &Inst::LoadExtName { rd, .. } => { collector.add_def(rd); } - &Inst::LoadAddr { rd, mem: _ } => { + &Inst::LoadAddr { rd, ref mem } => { collector.add_def(rd); + memarg_regs(mem, collector); } &Inst::VirtualSPOffsetAdj { .. } => {} &Inst::EmitIsland { .. } => {} @@ -2189,6 +2201,14 @@ fn aarch64_map_regs(inst: &mut Inst, mapper: &RUM) { map_def(mapper, rd); } } + &mut Inst::VecLoadReplicate { + ref mut rd, + ref mut rn, + .. + } => { + map_def(mapper, rd); + map_use(mapper, rn); + } &mut Inst::FpuCmp32 { ref mut rn, ref mut rm, @@ -3412,6 +3432,12 @@ impl Inst { let rm = show_vreg_vector(rm, mb_rru, VectorSize::Size8x16); format!("{} {}, {{ {}, {} }}, {}", op, rd, rn, rn2, rm) } + &Inst::VecLoadReplicate { rd, rn, size, .. } => { + let rd = show_vreg_vector(rd.to_reg(), mb_rru, size); + let rn = rn.show_rru(mb_rru); + + format!("ld1r {{ {} }}, [{}]", rd, rn) + } &Inst::MovToNZCV { rn } => { let rn = rn.show_rru(mb_rru); format!("msr nzcv, {}", rn) diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index fc28cb3581..ecdcb9c6d1 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1197,6 +1197,29 @@ pub(crate) fn lower_insn_to_regs>( } } + Opcode::LoadSplat => { + let off = ctx.data(insn).load_store_offset().unwrap(); + let ty = ty.unwrap(); + let mem = lower_address(ctx, ty.lane_type(), &inputs[..], off); + let memflags = ctx.memflags(insn).expect("memory flags"); + let rd = get_output_reg(ctx, outputs[0]); + let size = VectorSize::from_ty(ty); + let srcloc = if memflags.notrap() { + None + } else { + Some(ctx.srcloc(insn)) + }; + let tmp = ctx.alloc_tmp(RegClass::I64, I64); + + ctx.emit(Inst::LoadAddr { rd: tmp, mem }); + ctx.emit(Inst::VecLoadReplicate { + rd, + rn: tmp.to_reg(), + size, + srcloc, + }); + } + Opcode::Store | Opcode::Istore8 | Opcode::Istore16 diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs index 72890cffd9..976f1581e3 100644 --- a/cranelift/codegen/src/isa/x86/enc_tables.rs +++ b/cranelift/codegen/src/isa/x86/enc_tables.rs @@ -1892,3 +1892,31 @@ fn expand_tls_value( unreachable!(); } } + +fn expand_load_splat( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + _isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + + pos.use_srcloc(inst); + + let (ptr, offset, flags) = match pos.func.dfg[inst] { + ir::InstructionData::Load { + opcode: ir::Opcode::LoadSplat, + arg, + offset, + flags, + } => (arg, offset, flags), + _ => panic!( + "Expected load_splat: {}", + pos.func.dfg.display_inst(inst, None) + ), + }; + let ty = pos.func.dfg.ctrl_typevar(inst); + let load = pos.ins().load(ty.lane_type(), 
flags, ptr, offset); + + pos.func.dfg.replace(inst).splat(ty, load); +} diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 7c827802ba..ef1804cf12 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1380,19 +1380,17 @@ pub fn translate_operator( | Operator::V128Load16Splat { memarg } | Operator::V128Load32Splat { memarg } | Operator::V128Load64Splat { memarg } => { - // TODO: For spec compliance, this is initially implemented as a combination of `load + - // splat` but could be implemented eventually as a single instruction (`load_splat`). - // See https://github.com/bytecodealliance/wasmtime/issues/1175. - translate_load( + let opcode = ir::Opcode::LoadSplat; + let result_ty = type_of(op); + let (flags, base, offset) = prepare_load( memarg, - ir::Opcode::Load, - type_of(op).lane_type(), + mem_op_size(opcode, result_ty.lane_type()), builder, state, environ, )?; - let splatted = builder.ins().splat(type_of(op), state.pop1()); - state.push1(splatted) + let (load, dfg) = builder.ins().Load(opcode, result_ty, flags, offset, base); + state.push1(dfg.first_result(load)) } Operator::I8x16ExtractLaneS { lane } | Operator::I16x8ExtractLaneS { lane } => { let vector = pop1_with_bitcast(state, type_of(op), builder); @@ -2040,7 +2038,7 @@ fn mem_op_size(opcode: ir::Opcode, ty: Type) -> u32 { ir::Opcode::Istore8 | ir::Opcode::Sload8 | ir::Opcode::Uload8 => 1, ir::Opcode::Istore16 | ir::Opcode::Sload16 | ir::Opcode::Uload16 => 2, ir::Opcode::Istore32 | ir::Opcode::Sload32 | ir::Opcode::Uload32 => 4, - ir::Opcode::Store | ir::Opcode::Load => ty.bytes(), + ir::Opcode::Store | ir::Opcode::Load | ir::Opcode::LoadSplat => ty.bytes(), _ => panic!("unknown size of mem op for {:?}", opcode), } } From d990dd4c9a82ea3c924b228fe4f0bc2a74b24c3a Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Tue, 13 Oct 2020 10:02:12 -0700 Subject: [PATCH 2/3] [machinst x64]: add source locations to more instruction formats In order to register traps for `load_splat`, several instruction formats need knowledge of `SourceLoc`s; however, since the x64 backend does not correctly and completely register traps for `RegMem::Mem` variants I opened https://github.com/bytecodealliance/wasmtime/issues/2290 to discuss and resolve this issue. In the meantime, the current behavior (i.e. remaining largely unaware of `SourceLoc`s) is retained. --- cranelift/codegen/src/isa/x64/inst/emit.rs | 42 +++- .../codegen/src/isa/x64/inst/emit_tests.rs | 104 ++++----- cranelift/codegen/src/isa/x64/inst/mod.rs | 86 ++++--- cranelift/codegen/src/isa/x64/lower.rs | 216 +++++++++++++----- 4 files changed, 303 insertions(+), 145 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 688e620d83..397b21f69d 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1728,6 +1728,7 @@ pub(crate) fn emit( op, src: src_e, dst: reg_g, + srcloc, } => { let rex = RexFlags::clear_w(); let (prefix, opcode, length) = match op { @@ -1819,6 +1820,10 @@ pub(crate) fn emit( emit_std_reg_reg(sink, prefix, opcode, length, reg_g.to_reg(), *reg_e, rex); } RegMem::Mem { addr } => { + if let Some(srcloc) = *srcloc { + // Register the offset at which the actual load instruction starts. 
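+ // A trap record associates the current buffer offset with this
+ // `SourceLoc`, letting the runtime map a faulting PC on this
+ // memory access back to the Wasm source.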
+ sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } let addr = &addr.finalize(state); emit_std_reg_mem(sink, prefix, opcode, length, reg_g.to_reg(), addr, rex); } @@ -1889,7 +1894,7 @@ pub(crate) fn emit( // and negative zero. These instructions merge the sign bits in that // case, and are no-ops otherwise. let op = if *is_min { or_op } else { and_op }; - let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst); + let inst = Inst::xmm_rm_r(op, RegMem::reg(*lhs), *rhs_dst, None); inst.emit(sink, info, state); let inst = Inst::jmp_known(done); @@ -1899,13 +1904,13 @@ pub(crate) fn emit( // read-only operand: perform an addition between the two operands, which has the // desired NaN propagation effects. sink.bind_label(propagate_nan); - let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst); + let inst = Inst::xmm_rm_r(add_op, RegMem::reg(*lhs), *rhs_dst, None); inst.emit(sink, info, state); one_way_jmp(sink, CC::P, done); sink.bind_label(do_min_max); - let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst); + let inst = Inst::xmm_rm_r(min_max_op, RegMem::reg(*lhs), *rhs_dst, None); inst.emit(sink, info, state); sink.bind_label(done); @@ -1916,7 +1921,8 @@ pub(crate) fn emit( src, dst, imm, - is64: w, + is64, + srcloc, } => { let (prefix, opcode, len) = match op { SseOpcode::Cmpps => (LegacyPrefixes::None, 0x0FC2, 2), @@ -1933,7 +1939,7 @@ pub(crate) fn emit( SseOpcode::Pshufd => (LegacyPrefixes::_66, 0x0F70, 2), _ => unimplemented!("Opcode {:?} not implemented", op), }; - let rex = if *w { + let rex = if *is64 { RexFlags::set_w() } else { RexFlags::clear_w() @@ -1955,6 +1961,10 @@ pub(crate) fn emit( } } RegMem::Mem { addr } => { + if let Some(srcloc) = *srcloc { + // Register the offset at which the actual load instruction starts. + sink.add_trap(srcloc, TrapCode::HeapOutOfBounds); + } let addr = &addr.finalize(state); assert!( !regs_swapped, @@ -1963,7 +1973,7 @@ pub(crate) fn emit( emit_std_reg_mem(sink, prefix, opcode, len, dst.to_reg(), addr, rex); } } - sink.put1(*imm) + sink.put1(*imm); } Inst::XmmLoadConstSeq { val, dst, ty } => { @@ -2188,7 +2198,7 @@ pub(crate) fn emit( } else { SseOpcode::Addss }; - let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst); + let inst = Inst::xmm_rm_r(add_op, RegMem::reg(dst.to_reg()), *dst, None); inst.emit(sink, info, state); sink.bind_label(done); @@ -2295,8 +2305,12 @@ pub(crate) fn emit( // If the input was positive, saturate to INT_MAX. // Zero out tmp_xmm. - let inst = - Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm); + let inst = Inst::xmm_rm_r( + SseOpcode::Xorpd, + RegMem::reg(tmp_xmm.to_reg()), + *tmp_xmm, + None, + ); inst.emit(sink, info, state); let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg()); @@ -2367,8 +2381,12 @@ pub(crate) fn emit( sink.bind_label(check_positive); // Zero out the tmp_xmm register. 
- let inst = - Inst::xmm_rm_r(SseOpcode::Xorpd, RegMem::reg(tmp_xmm.to_reg()), *tmp_xmm); + let inst = Inst::xmm_rm_r( + SseOpcode::Xorpd, + RegMem::reg(tmp_xmm.to_reg()), + *tmp_xmm, + None, + ); inst.emit(sink, info, state); let inst = Inst::xmm_cmp_rm_r(cmp_op, RegMem::reg(src), tmp_xmm.to_reg()); @@ -2522,7 +2540,7 @@ pub(crate) fn emit( sink.bind_label(handle_large); - let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src); + let inst = Inst::xmm_rm_r(sub_op, RegMem::reg(tmp_xmm.to_reg()), *src, None); inst.emit(sink, info, state); let inst = Inst::xmm_to_gpr(trunc_op, src.to_reg(), *dst, *dst_size); diff --git a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs index 62992be2bd..71120a101d 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit_tests.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit_tests.rs @@ -2983,12 +2983,12 @@ fn test_x64_emit() { // XMM_RM_R: float binary ops insns.push(( - Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0), + Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm1), w_xmm0, None), "F30F58C1", "addss %xmm1, %xmm0", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Addss, RegMem::reg(xmm11), w_xmm13, None), "F3450F58EB", "addss %xmm11, %xmm13", )); @@ -2997,23 +2997,24 @@ fn test_x64_emit() { SseOpcode::Addss, RegMem::mem(Amode::imm_reg_reg_shift(123, r10, rdx, 2)), w_xmm0, + None, ), "F3410F5844927B", "addss 123(%r10,%rdx,4), %xmm0", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4), + Inst::xmm_rm_r(SseOpcode::Addsd, RegMem::reg(xmm15), w_xmm4, None), "F2410F58E7", "addsd %xmm15, %xmm4", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm0), w_xmm1, None), "F30F5CC8", "subss %xmm0, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Subss, RegMem::reg(xmm12), w_xmm1, None), "F3410F5CCC", "subss %xmm12, %xmm1", )); @@ -3022,57 +3023,58 @@ fn test_x64_emit() { SseOpcode::Subss, RegMem::mem(Amode::imm_reg_reg_shift(321, r10, rax, 3)), w_xmm10, + None, ), "F3450F5C94C241010000", "subss 321(%r10,%rax,8), %xmm10", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14), + Inst::xmm_rm_r(SseOpcode::Subsd, RegMem::reg(xmm5), w_xmm14, None), "F2440F5CF5", "subsd %xmm5, %xmm14", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4), + Inst::xmm_rm_r(SseOpcode::Mulss, RegMem::reg(xmm5), w_xmm4, None), "F30F59E5", "mulss %xmm5, %xmm4", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4), + Inst::xmm_rm_r(SseOpcode::Mulsd, RegMem::reg(xmm5), w_xmm4, None), "F20F59E5", "mulsd %xmm5, %xmm4", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7), + Inst::xmm_rm_r(SseOpcode::Divss, RegMem::reg(xmm8), w_xmm7, None), "F3410F5EF8", "divss %xmm8, %xmm7", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4), + Inst::xmm_rm_r(SseOpcode::Divsd, RegMem::reg(xmm5), w_xmm4, None), "F20F5EE5", "divsd %xmm5, %xmm4", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12), + Inst::xmm_rm_r(SseOpcode::Andps, RegMem::reg(xmm3), w_xmm12, None), "440F54E3", "andps %xmm3, %xmm12", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11), + Inst::xmm_rm_r(SseOpcode::Andnps, RegMem::reg(xmm4), w_xmm11, None), 
"440F55DC", "andnps %xmm4, %xmm11", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15), + Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm1), w_xmm15, None), "440F56F9", "orps %xmm1, %xmm15", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4), + Inst::xmm_rm_r(SseOpcode::Orps, RegMem::reg(xmm5), w_xmm4, None), "0F56E5", "orps %xmm5, %xmm4", )); @@ -3081,211 +3083,211 @@ fn test_x64_emit() { // XMM_RM_R: Integer Packed insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5), + Inst::xmm_rm_r(SseOpcode::Paddb, RegMem::reg(xmm9), w_xmm5, None), "66410FFCE9", "paddb %xmm9, %xmm5", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Paddw, RegMem::reg(xmm7), w_xmm6, None), "660FFDF7", "paddw %xmm7, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Paddd, RegMem::reg(xmm12), w_xmm13, None), "66450FFEEC", "paddd %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Paddq, RegMem::reg(xmm1), w_xmm8, None), "66440FD4C1", "paddq %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5), + Inst::xmm_rm_r(SseOpcode::Paddsb, RegMem::reg(xmm9), w_xmm5, None), "66410FECE9", "paddsb %xmm9, %xmm5", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Paddsw, RegMem::reg(xmm7), w_xmm6, None), "660FEDF7", "paddsw %xmm7, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Paddusb, RegMem::reg(xmm12), w_xmm13, None), "66450FDCEC", "paddusb %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Paddusw, RegMem::reg(xmm1), w_xmm8, None), "66440FDDC1", "paddusw %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5), + Inst::xmm_rm_r(SseOpcode::Psubsb, RegMem::reg(xmm9), w_xmm5, None), "66410FE8E9", "psubsb %xmm9, %xmm5", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Psubsw, RegMem::reg(xmm7), w_xmm6, None), "660FE9F7", "psubsw %xmm7, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Psubusb, RegMem::reg(xmm12), w_xmm13, None), "66450FD8EC", "psubusb %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Psubusw, RegMem::reg(xmm1), w_xmm8, None), "66440FD9C1", "psubusw %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13), + Inst::xmm_rm_r(SseOpcode::Pavgb, RegMem::reg(xmm12), w_xmm13, None), "66450FE0EC", "pavgb %xmm12, %xmm13", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8), + Inst::xmm_rm_r(SseOpcode::Pavgw, RegMem::reg(xmm1), w_xmm8, None), "66440FE3C1", "pavgw %xmm1, %xmm8", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Psubb, RegMem::reg(xmm5), w_xmm9, None), "66440FF8CD", "psubb %xmm5, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7), + Inst::xmm_rm_r(SseOpcode::Psubw, RegMem::reg(xmm6), w_xmm7, None), "660FF9FE", "psubw %xmm6, %xmm7", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubd, 
RegMem::reg(xmm13), w_xmm12), + Inst::xmm_rm_r(SseOpcode::Psubd, RegMem::reg(xmm13), w_xmm12, None), "66450FFAE5", "psubd %xmm13, %xmm12", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Psubq, RegMem::reg(xmm8), w_xmm1, None), "66410FFBC8", "psubq %xmm8, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmulld, RegMem::reg(xmm15), w_xmm6, None), "66410F3840F7", "pmulld %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmullw, RegMem::reg(xmm14), w_xmm1, None), "66410FD5CE", "pmullw %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(xmm8), w_xmm9, None), "66450FF4C8", "pmuludq %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmaxsb, RegMem::reg(xmm15), w_xmm6, None), "66410F383CF7", "pmaxsb %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmaxsw, RegMem::reg(xmm15), w_xmm6, None), "66410FEEF7", "pmaxsw %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6), + Inst::xmm_rm_r(SseOpcode::Pmaxsd, RegMem::reg(xmm15), w_xmm6, None), "66410F383DF7", "pmaxsd %xmm15, %xmm6", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmaxub, RegMem::reg(xmm14), w_xmm1, None), "66410FDECE", "pmaxub %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmaxuw, RegMem::reg(xmm14), w_xmm1, None), "66410F383ECE", "pmaxuw %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1), + Inst::xmm_rm_r(SseOpcode::Pmaxud, RegMem::reg(xmm14), w_xmm1, None), "66410F383FCE", "pmaxud %xmm14, %xmm1", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pminsb, RegMem::reg(xmm8), w_xmm9, None), "66450F3838C8", "pminsb %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pminsw, RegMem::reg(xmm8), w_xmm9, None), "66450FEAC8", "pminsw %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9), + Inst::xmm_rm_r(SseOpcode::Pminsd, RegMem::reg(xmm8), w_xmm9, None), "66450F3839C8", "pminsd %xmm8, %xmm9", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pminub, RegMem::reg(xmm3), w_xmm2, None), "660FDAD3", "pminub %xmm3, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pminuw, RegMem::reg(xmm3), w_xmm2, None), "660F383AD3", "pminuw %xmm3, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pminud, RegMem::reg(xmm3), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pminud, RegMem::reg(xmm3), w_xmm2, None), "660F383BD3", "pminud %xmm3, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::reg(xmm11), w_xmm2, None), "66410FEFD3", "pxor %xmm11, %xmm2", )); insns.push(( - Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2), + Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::reg(xmm11), w_xmm2, None), "66410F3800D3", "pshufb %xmm11, 
%xmm2", )); @@ -3488,12 +3490,12 @@ fn test_x64_emit() { // ======================================================== // XmmRmRImm insns.push(( - Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false), + Inst::xmm_rm_r_imm(SseOpcode::Cmppd, RegMem::reg(xmm5), w_xmm1, 2, false, None), "660FC2CD02", "cmppd $2, %xmm5, %xmm1", )); insns.push(( - Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false), + Inst::xmm_rm_r_imm(SseOpcode::Cmpps, RegMem::reg(xmm15), w_xmm7, 0, false, None), "410FC2FF00", "cmpps $0, %xmm15, %xmm7", )); diff --git a/cranelift/codegen/src/isa/x64/inst/mod.rs b/cranelift/codegen/src/isa/x64/inst/mod.rs index 1fe0de6941..aac925db62 100644 --- a/cranelift/codegen/src/isa/x64/inst/mod.rs +++ b/cranelift/codegen/src/isa/x64/inst/mod.rs @@ -212,6 +212,7 @@ pub enum Inst { op: SseOpcode, src: RegMem, dst: Writable, + srcloc: Option, }, /// XMM (scalar or vector) unary op: mov between XMM registers (32 64) (reg addr) reg, sqrt, @@ -338,6 +339,7 @@ pub enum Inst { dst: Writable, imm: u8, is64: bool, + srcloc: Option, }, // ===================================== @@ -711,10 +713,20 @@ impl Inst { } } - pub(crate) fn xmm_rm_r(op: SseOpcode, src: RegMem, dst: Writable) -> Self { + pub(crate) fn xmm_rm_r( + op: SseOpcode, + src: RegMem, + dst: Writable, + srcloc: Option, + ) -> Self { src.assert_regclass_is(RegClass::V128); debug_assert!(dst.to_reg().get_class() == RegClass::V128); - Inst::XmmRmR { op, src, dst } + Inst::XmmRmR { + op, + src, + dst, + srcloc, + } } pub(crate) fn xmm_uninit_value(dst: Writable) -> Self { @@ -869,6 +881,7 @@ impl Inst { dst: Writable, imm: u8, is64: bool, + srcloc: Option, ) -> Inst { Inst::XmmRmRImm { op, @@ -876,6 +889,7 @@ impl Inst { dst, imm, is64, + srcloc, } } @@ -1233,16 +1247,26 @@ impl Inst { /// Choose which instruction to use for comparing two values for equality. pub(crate) fn equals(ty: Type, from: RegMem, to: Writable) -> Inst { match ty { - types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to), - types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to), - types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to), - types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to), - types::F32X4 => { - Inst::xmm_rm_r_imm(SseOpcode::Cmpps, from, to, FcmpImm::Equal.encode(), false) - } - types::F64X2 => { - Inst::xmm_rm_r_imm(SseOpcode::Cmppd, from, to, FcmpImm::Equal.encode(), false) - } + types::I8X16 | types::B8X16 => Inst::xmm_rm_r(SseOpcode::Pcmpeqb, from, to, None), + types::I16X8 | types::B16X8 => Inst::xmm_rm_r(SseOpcode::Pcmpeqw, from, to, None), + types::I32X4 | types::B32X4 => Inst::xmm_rm_r(SseOpcode::Pcmpeqd, from, to, None), + types::I64X2 | types::B64X2 => Inst::xmm_rm_r(SseOpcode::Pcmpeqq, from, to, None), + types::F32X4 => Inst::xmm_rm_r_imm( + SseOpcode::Cmpps, + from, + to, + FcmpImm::Equal.encode(), + false, + None, + ), + types::F64X2 => Inst::xmm_rm_r_imm( + SseOpcode::Cmppd, + from, + to, + FcmpImm::Equal.encode(), + false, + None, + ), _ => unimplemented!("unimplemented type for Inst::equals: {}", ty), } } @@ -1250,9 +1274,11 @@ impl Inst { /// Choose which instruction to use for computing a bitwise AND on two values. 
pub(crate) fn and(ty: Type, from: RegMem, to: Writable) -> Inst { match ty { - types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to), - types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to), - _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pand, from, to), + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andps, from, to, None), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andpd, from, to, None), + _ if ty.is_vector() && ty.bits() == 128 => { + Inst::xmm_rm_r(SseOpcode::Pand, from, to, None) + } _ => unimplemented!("unimplemented type for Inst::and: {}", ty), } } @@ -1260,9 +1286,11 @@ impl Inst { /// Choose which instruction to use for computing a bitwise AND NOT on two values. pub(crate) fn and_not(ty: Type, from: RegMem, to: Writable) -> Inst { match ty { - types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to), - types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to), - _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pandn, from, to), + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Andnps, from, to, None), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Andnpd, from, to, None), + _ if ty.is_vector() && ty.bits() == 128 => { + Inst::xmm_rm_r(SseOpcode::Pandn, from, to, None) + } _ => unimplemented!("unimplemented type for Inst::and_not: {}", ty), } } @@ -1270,9 +1298,11 @@ impl Inst { /// Choose which instruction to use for computing a bitwise OR on two values. pub(crate) fn or(ty: Type, from: RegMem, to: Writable) -> Inst { match ty { - types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to), - types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to), - _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Por, from, to), + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Orps, from, to, None), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Orpd, from, to, None), + _ if ty.is_vector() && ty.bits() == 128 => { + Inst::xmm_rm_r(SseOpcode::Por, from, to, None) + } _ => unimplemented!("unimplemented type for Inst::or: {}", ty), } } @@ -1280,9 +1310,11 @@ impl Inst { /// Choose which instruction to use for computing a bitwise XOR on two values. pub(crate) fn xor(ty: Type, from: RegMem, to: Writable) -> Inst { match ty { - types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to), - types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to), - _ if ty.is_vector() && ty.bits() == 128 => Inst::xmm_rm_r(SseOpcode::Pxor, from, to), + types::F32X4 => Inst::xmm_rm_r(SseOpcode::Xorps, from, to, None), + types::F64X2 => Inst::xmm_rm_r(SseOpcode::Xorpd, from, to, None), + _ if ty.is_vector() && ty.bits() == 128 => { + Inst::xmm_rm_r(SseOpcode::Pxor, from, to, None) + } _ => unimplemented!("unimplemented type for Inst::xor: {}", ty), } } @@ -1429,7 +1461,7 @@ impl PrettyPrint for Inst { dst.show_rru(mb_rru), ), - Inst::XmmRmR { op, src, dst } => format!( + Inst::XmmRmR { op, src, dst, .. } => format!( "{} {}, {}", ljustify(op.to_string()), src.show_rru_sized(mb_rru, 8), @@ -1459,7 +1491,7 @@ impl PrettyPrint for Inst { show_ireg_sized(rhs_dst.to_reg(), mb_rru, 8), ), - Inst::XmmRmRImm { op, src, dst, imm, is64 } => format!( + Inst::XmmRmRImm { op, src, dst, imm, is64, .. 
} => format!( "{} ${}, {}, {}", ljustify(format!("{}{}", op.to_string(), if *is64 { ".w" } else { "" })), imm, @@ -2595,6 +2627,7 @@ impl MachInst for Inst { SseOpcode::Xorps, RegMem::reg(to_reg.to_reg()), to_reg, + None, )); } else { let tmp = alloc_tmp(RegClass::I64, types::I32); @@ -2613,6 +2646,7 @@ impl MachInst for Inst { SseOpcode::Xorpd, RegMem::reg(to_reg.to_reg()), to_reg, + None, )); } else { let tmp = alloc_tmp(RegClass::I64, types::I64); diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 108072b97c..576b875515 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -3,7 +3,7 @@ use crate::data_value::DataValue; use crate::ir::{ condcodes::FloatCC, condcodes::IntCC, types, AbiParam, ArgumentPurpose, ExternalName, - Inst as IRInst, InstructionData, LibCall, Opcode, Signature, Type, + Inst as IRInst, InstructionData, LibCall, Opcode, Signature, SourceLoc, Type, }; use crate::isa::x64::abi::*; use crate::isa::x64::inst::args::*; @@ -227,6 +227,7 @@ fn emit_insert_lane>( dst: Writable, lane: u8, ty: Type, + srcloc: Option, ) { if !ty.is_float() { let (sse_op, is64) = match ty.lane_bits() { @@ -236,13 +237,13 @@ fn emit_insert_lane>( 64 => (SseOpcode::Pinsrd, true), _ => panic!("Unable to insertlane for lane size: {}", ty.lane_bits()), }; - ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64)); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, is64, srcloc)); } else if ty == types::F32 { let sse_op = SseOpcode::Insertps; // Insert 32-bits from replacement (at index 00, bits 7:8) to vector (lane // shifted into bits 5:6). let lane = 0b00_00_00_00 | lane << 4; - ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false)); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, false, srcloc)); } else if ty == types::F64 { let sse_op = match lane { // Move the lowest quadword in replacement to vector without changing @@ -256,7 +257,7 @@ fn emit_insert_lane>( // Here we use the `xmm_rm_r` encoding because it correctly tells the register // allocator how we are using `dst`: we are using `dst` as a `mod` whereas other // encoding formats like `xmm_unary_rm_r` treat it as a `def`. - ctx.emit(Inst::xmm_rm_r(sse_op, src, dst)); + ctx.emit(Inst::xmm_rm_r(sse_op, src, dst, srcloc)); } else { panic!("unable to emit insertlane for type: {}", ty) } @@ -694,6 +695,7 @@ fn lower_insn_to_regs>( SseOpcode::Pmuludq, RegMem::reg(lhs.clone()), rhs_1, + None, )); // B' = B @@ -707,7 +709,12 @@ fn lower_insn_to_regs>( RegMemImm::imm(32), lhs_1, )); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmuludq, RegMem::reg(rhs), lhs_1)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmuludq, + RegMem::reg(rhs), + lhs_1, + None, + )); // B' = B' + A' // B' = B' << 32 @@ -715,6 +722,7 @@ fn lower_insn_to_regs>( SseOpcode::Paddq, RegMem::reg(rhs_1.to_reg()), lhs_1, + None, )); ctx.emit(Inst::xmm_rmi_reg( SseOpcode::Psllq, @@ -731,11 +739,13 @@ fn lower_insn_to_regs>( SseOpcode::Pmuludq, RegMem::reg(lhs.clone()), rhs_1, + None, )); ctx.emit(Inst::xmm_rm_r( SseOpcode::Paddq, RegMem::reg(lhs_1.to_reg()), rhs_1, + None, )); ctx.emit(Inst::gen_move(dst, rhs_1.to_reg(), ty)); return Ok(()); @@ -770,7 +780,7 @@ fn lower_insn_to_regs>( // Move the `lhs` to the same register as `dst`. 
ctx.emit(Inst::gen_move(dst, lhs, ty)); - ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None)); } else { let is_64 = ty == types::I64; let alu_op = match op { @@ -828,7 +838,7 @@ fn lower_insn_to_regs>( // Note the flipping of operands: the `rhs` operand is used as the destination instead // of the `lhs` as in the other bit operations above (e.g. `band`). ctx.emit(Inst::gen_move(dst, rhs, ty)); - ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst)); + ctx.emit(Inst::xmm_rm_r(sse_op, lhs, dst, None)); } Opcode::Iabs => { @@ -884,7 +894,7 @@ fn lower_insn_to_regs>( // Move the `lhs` to the same register as `dst`. ctx.emit(Inst::gen_move(dst, lhs, ty)); - ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None)); } else { panic!("Unsupported type for {} instruction: {}", op, ty); } @@ -1007,8 +1017,9 @@ fn lower_insn_to_regs>( SseOpcode::Pxor, RegMem::reg(tmp.to_reg()), tmp, + None, )); - ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp)); + ctx.emit(Inst::xmm_rm_r(subtract_opcode, src, tmp, None)); ctx.emit(Inst::xmm_unary_rm_r( SseOpcode::Movapd, RegMem::reg(tmp.to_reg()), @@ -1561,34 +1572,44 @@ fn lower_insn_to_regs>( }; match condcode { - IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)), + IntCC::Equal => ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None)), IntCC::NotEqual => { - ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None)); // Emit all 1s into the `tmp` register. let tmp = ctx.alloc_tmp(RegClass::V128, ty); - ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); + ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp, None)); // Invert the result of the `PCMPEQ*`. - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::from(tmp), + dst, + None, + )); } IntCC::SignedGreaterThan | IntCC::SignedLessThan => { - ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst)) + ctx.emit(Inst::xmm_rm_r(gt(ty), input, dst, None)) } IntCC::SignedGreaterThanOrEqual | IntCC::SignedLessThanOrEqual => { - ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst)); - ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)) + ctx.emit(Inst::xmm_rm_r(mins(ty), input.clone(), dst, None)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None)) } IntCC::UnsignedGreaterThan | IntCC::UnsignedLessThan => { - ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst)); - ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)); + ctx.emit(Inst::xmm_rm_r(maxu(ty), input.clone(), dst, None)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None)); // Emit all 1s into the `tmp` register. let tmp = ctx.alloc_tmp(RegClass::V128, ty); - ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp)); + ctx.emit(Inst::xmm_rm_r(eq(ty), RegMem::from(tmp), tmp, None)); // Invert the result of the `PCMPEQ*`. - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::from(tmp), + dst, + None, + )); } IntCC::UnsignedGreaterThanOrEqual | IntCC::UnsignedLessThanOrEqual => { - ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst)); - ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst)) + ctx.emit(Inst::xmm_rm_r(minu(ty), input.clone(), dst, None)); + ctx.emit(Inst::xmm_rm_r(eq(ty), input, dst, None)) } _ => unimplemented!("Unimplemented comparison code for icmp: {}", condcode), } @@ -1686,7 +1707,7 @@ fn lower_insn_to_regs>( ctx.emit(Inst::gen_move(dst, lhs, input_ty)); // Emit the comparison. 
- ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false)); + ctx.emit(Inst::xmm_rm_r_imm(op, rhs, dst, imm.encode(), false, None)); } } @@ -1899,7 +1920,7 @@ fn lower_insn_to_regs>( ty ), }; - ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst)); + ctx.emit(Inst::xmm_rm_r(sse_op, rhs, dst, None)); } Opcode::Fmin | Opcode::Fmax => { @@ -1988,15 +2009,15 @@ fn lower_insn_to_regs>( ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None)); // Perform min in reverse direction - ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1)); + ctx.emit(Inst::xmm_rm_r(min_op, RegMem::from(dst), tmp_xmm1, None)); // Perform min in original direction - ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst)); + ctx.emit(Inst::xmm_rm_r(min_op, RegMem::reg(lhs), dst, None)); // X64 handles propagation of -0's and Nans differently between left and right // operands. After doing the min in both directions, this OR will // guarrentee capture of -0's and Nan in our tmp register - ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1)); + ctx.emit(Inst::xmm_rm_r(or_op, RegMem::from(dst), tmp_xmm1, None)); // Compare unordered to create mask for lanes containing NaNs and then use // that mask to saturate the NaN containing lanes in the tmp register with 1s. @@ -2009,8 +2030,14 @@ fn lower_insn_to_regs>( dst, cond.encode(), false, + None, + )); + ctx.emit(Inst::xmm_rm_r( + or_op, + RegMem::reg(dst.to_reg()), + tmp_xmm1, + None, )); - ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1)); // The dst register holds a mask for lanes containing NaNs. // We take that mask and shift in preparation for creating a different mask @@ -2022,7 +2049,12 @@ fn lower_insn_to_regs>( // Finally we do a nand with the tmp register to produce the final results // in the dst. - ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst)); + ctx.emit(Inst::xmm_rm_r( + andn_op, + RegMem::reg(tmp_xmm1.to_reg()), + dst, + None, + )); } else { let ( mov_op, @@ -2065,23 +2097,43 @@ fn lower_insn_to_regs>( ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(lhs), tmp_xmm1, None)); // Perform max in reverse direction. - ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(dst.to_reg()), tmp_xmm1)); + ctx.emit(Inst::xmm_rm_r( + max_op, + RegMem::reg(dst.to_reg()), + tmp_xmm1, + None, + )); // Perform max in original direction. - ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst)); + ctx.emit(Inst::xmm_rm_r(max_op, RegMem::reg(lhs), dst, None)); // Get the difference between the two results and store in tmp. // Max uses a different approach than min to account for potential // discrepancies with plus/minus 0. - ctx.emit(Inst::xmm_rm_r(xor_op, RegMem::reg(tmp_xmm1.to_reg()), dst)); + ctx.emit(Inst::xmm_rm_r( + xor_op, + RegMem::reg(tmp_xmm1.to_reg()), + dst, + None, + )); // X64 handles propagation of -0's and Nans differently between left and right // operands. After doing the max in both directions, this OR will // guarentee capture of 0's and Nan in our tmp register. - ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(dst.to_reg()), tmp_xmm1)); + ctx.emit(Inst::xmm_rm_r( + or_op, + RegMem::reg(dst.to_reg()), + tmp_xmm1, + None, + )); // Capture NaNs and sign discrepancies. - ctx.emit(Inst::xmm_rm_r(sub_op, RegMem::reg(dst.to_reg()), tmp_xmm1)); + ctx.emit(Inst::xmm_rm_r( + sub_op, + RegMem::reg(dst.to_reg()), + tmp_xmm1, + None, + )); // Compare unordered to create mask for lanes containing NaNs and then use // that mask to saturate the NaN containing lanes in the tmp register with 1s. 
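(The unordered-compare step above is easiest to see one lane at a time; a
scalar model of the mask it computes — illustrative only, not part of the
patch:)

    // `cmpps`/`cmppd` with the "unordered" predicate sets a lane to all
    // ones exactly when either operand is NaN; the lowering then uses
    // that mask to force a canonical NaN into those lanes.
    fn unordered_mask(a: f64, b: f64) -> u64 {
        if a.is_nan() || b.is_nan() { !0u64 } else { 0u64 }
    }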
@@ -2092,6 +2144,7 @@ fn lower_insn_to_regs>( dst, cond.encode(), false, + None, )); // The dst register holds a mask for lanes containing NaNs. @@ -2104,7 +2157,12 @@ fn lower_insn_to_regs>( // Finally we do a nand with the tmp register to produce the final results // in the dst. - ctx.emit(Inst::xmm_rm_r(andn_op, RegMem::reg(tmp_xmm1.to_reg()), dst)); + ctx.emit(Inst::xmm_rm_r( + andn_op, + RegMem::reg(tmp_xmm1.to_reg()), + dst, + None, + )); } } } @@ -2327,7 +2385,7 @@ fn lower_insn_to_regs>( ctx.emit(inst); } - ctx.emit(Inst::xmm_rm_r(opcode, src, dst)); + ctx.emit(Inst::xmm_rm_r(opcode, src, dst, None)); } else { // Eventually vector constants should be available in `gen_constant` and this block // can be merged with the one above (TODO). @@ -2348,6 +2406,7 @@ fn lower_insn_to_regs>( tmp, cond.encode(), false, + None, ); ctx.emit(cmpps); @@ -2367,7 +2426,7 @@ fn lower_insn_to_regs>( ctx.emit(shift); // Apply shifted mask (XOR or AND). - let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst); + let mask = Inst::xmm_rm_r(opcode, RegMem::reg(tmp.to_reg()), dst, None); ctx.emit(mask); } else { panic!("unexpected type {:?} for Fabs", output_ty); @@ -2426,14 +2485,20 @@ fn lower_insn_to_regs>( dst, None, )); - ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst)); + ctx.emit(Inst::xmm_rm_r(and_not_op, RegMem::reg(lhs), dst, None)); ctx.emit(Inst::xmm_mov(mov_op, RegMem::reg(rhs), tmp_xmm2, None)); ctx.emit(Inst::xmm_rm_r( and_op, RegMem::reg(tmp_xmm1.to_reg()), tmp_xmm2, + None, + )); + ctx.emit(Inst::xmm_rm_r( + or_op, + RegMem::reg(tmp_xmm2.to_reg()), + dst, + None, )); - ctx.emit(Inst::xmm_rm_r(or_op, RegMem::reg(tmp_xmm2.to_reg()), dst)); } Opcode::Ceil | Opcode::Floor | Opcode::Nearest | Opcode::Trunc => { @@ -3154,7 +3219,12 @@ fn lower_insn_to_regs>( // After loading the constructed mask in a temporary register, we use this to // shuffle the `dst` register (remember that, in this case, it is the same as // `src` so we disregard this register). - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pshufb, + RegMem::from(tmp), + dst, + None, + )); } else { // If `lhs` and `rhs` are different, we must shuffle each separately and then OR // them together. This is necessary due to PSHUFB semantics. As in the case above, @@ -3166,7 +3236,12 @@ fn lower_insn_to_regs>( let constructed_mask = mask.iter().cloned().map(zero_unknown_lane_index).collect(); let tmp1 = ctx.alloc_tmp(RegClass::V128, types::I8X16); ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp1, ty)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp1), tmp0)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pshufb, + RegMem::from(tmp1), + tmp0, + None, + )); // PSHUFB the second argument, placing zeroes for unused lanes. let constructed_mask = mask @@ -3176,11 +3251,21 @@ fn lower_insn_to_regs>( .collect(); let tmp2 = ctx.alloc_tmp(RegClass::V128, types::I8X16); ctx.emit(Inst::xmm_load_const_seq(constructed_mask, tmp2, ty)); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp2), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pshufb, + RegMem::from(tmp2), + dst, + None, + )); // OR the shuffled registers (the mechanism and lane-size for OR-ing the registers // is not important). 
- ctx.emit(Inst::xmm_rm_r(SseOpcode::Orps, RegMem::from(tmp0), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Orps, + RegMem::from(tmp0), + dst, + None, + )); // TODO when AVX512 is enabled we should replace this sequence with a single VPERMB } @@ -3214,6 +3299,7 @@ fn lower_insn_to_regs>( SseOpcode::Paddusb, RegMem::from(zero_mask), swizzle_mask, + None, )); // Shuffle `dst` using the fixed-up `swizzle_mask`. @@ -3221,6 +3307,7 @@ fn lower_insn_to_regs>( SseOpcode::Pshufb, RegMem::from(swizzle_mask), dst, + None, )); } @@ -3240,7 +3327,7 @@ fn lower_insn_to_regs>( debug_assert!(lane < ty.lane_count() as u8); ctx.emit(Inst::gen_move(dst, in_vec, ty)); - emit_insert_lane(ctx, src, dst, lane, ty.lane_type()); + emit_insert_lane(ctx, src, dst, lane, ty.lane_type(), None); } Opcode::Extractlane => { @@ -3266,7 +3353,7 @@ fn lower_insn_to_regs>( _ => panic!("Unable to extractlane for lane size: {}", ty.lane_bits()), }; let src = RegMem::reg(src); - ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit)); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, lane, w_bit, None)); } else { if lane == 0 { // Remove the extractlane instruction, leaving the float where it is. The upper @@ -3288,7 +3375,7 @@ fn lower_insn_to_regs>( _ => unreachable!(), }; let src = RegMem::reg(src); - ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false)); + ctx.emit(Inst::xmm_rm_r_imm(sse_op, src, dst, mask, false, None)); } } } @@ -3307,16 +3394,26 @@ fn lower_insn_to_regs>( ctx.emit(Inst::xmm_uninit_value(dst)); match ty.lane_bits() { 8 => { - emit_insert_lane(ctx, src, dst, 0, ty.lane_type()); + emit_insert_lane(ctx, src, dst, 0, ty.lane_type(), srcloc); // Initialize a register with all 0s. let tmp = ctx.alloc_tmp(RegClass::V128, ty); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::from(tmp), + tmp, + srcloc, + )); // Shuffle the lowest byte lane to all other lanes. - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pshufb, RegMem::from(tmp), dst)) + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pshufb, + RegMem::from(tmp), + dst, + srcloc, + )) } 16 => { - emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type()); - emit_insert_lane(ctx, src, dst, 1, ty.lane_type()); + emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type(), srcloc); + emit_insert_lane(ctx, src, dst, 1, ty.lane_type(), srcloc); // Shuffle the lowest two lanes to all other lanes. ctx.emit(Inst::xmm_rm_r_imm( SseOpcode::Pshufd, @@ -3324,10 +3421,11 @@ fn lower_insn_to_regs>( dst, 0, false, + srcloc, )) } 32 => { - emit_insert_lane(ctx, src, dst, 0, ty.lane_type()); + emit_insert_lane(ctx, src, dst, 0, ty.lane_type(), srcloc); // Shuffle the lowest lane to all other lanes. ctx.emit(Inst::xmm_rm_r_imm( SseOpcode::Pshufd, @@ -3335,11 +3433,12 @@ fn lower_insn_to_regs>( dst, 0, false, + srcloc, )) } 64 => { - emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type()); - emit_insert_lane(ctx, src, dst, 1, ty.lane_type()); + emit_insert_lane(ctx, src.clone(), dst, 0, ty.lane_type(), srcloc); + emit_insert_lane(ctx, src, dst, 1, ty.lane_type(), srcloc); } _ => panic!("Invalid type to splat: {}", ty), } @@ -3373,9 +3472,14 @@ fn lower_insn_to_regs>( // Initialize a register with all 0s. let tmp = ctx.alloc_tmp(RegClass::V128, ty); - ctx.emit(Inst::xmm_rm_r(SseOpcode::Pxor, RegMem::from(tmp), tmp)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::from(tmp), + tmp, + None, + )); // Compare to see what lanes are filled with all 1s. 
- ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp)); + ctx.emit(Inst::xmm_rm_r(eq(src_ty), src, tmp, None)); // Set the ZF if the result is all zeroes. ctx.emit(Inst::xmm_cmp_rm_r( SseOpcode::Ptest, From a26e9e9a20be9e22fb6ab60befd25abde7ae46b8 Mon Sep 17 00:00:00 2001 From: Andrew Brown Date: Tue, 13 Oct 2020 10:02:50 -0700 Subject: [PATCH 3/3] [machinst x64]: lower load_splat using memory addressing --- cranelift/codegen/src/isa/x64/lower.rs | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 576b875515..614589d160 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -3380,18 +3380,30 @@ fn lower_insn_to_regs>( } } - Opcode::Splat => { + Opcode::Splat | Opcode::LoadSplat => { let ty = ty.unwrap(); assert_eq!(ty.bits(), 128); let src_ty = ctx.input_ty(insn, 0); assert!(src_ty.bits() < 128); - let src = input_to_reg_mem(ctx, inputs[0]); + + let (src, srcloc) = match op { + Opcode::Splat => (input_to_reg_mem(ctx, inputs[0]), None), + Opcode::LoadSplat => { + let offset = ctx.data(insn).load_store_offset().unwrap(); + let amode = lower_to_amode(ctx, inputs[0], offset); + (RegMem::mem(amode), Some(ctx.srcloc(insn))) + } + _ => unreachable!(), + }; let dst = get_output_reg(ctx, outputs[0]); // We know that splat will overwrite all of the lanes of `dst` but it takes several // instructions to do so. Because of the multiple instructions, there is no good way to // declare `dst` a `def` except with the following pseudo-instruction. ctx.emit(Inst::xmm_uninit_value(dst)); + + // TODO: eventually many of these sequences could be optimized with AVX's VBROADCAST* + // and VPBROADCAST*. match ty.lane_bits() { 8 => { emit_insert_lane(ctx, src, dst, 0, ty.lane_type(), srcloc);
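Taken together, the three patches let a Wasm `v128.loadN_splat` reach the
backends as a single IR node. A minimal sketch of emitting it through the
generated `InstBuilder` method (the `load_splat` builder name and argument
order are assumed to follow the `load` instruction format shown above;
`builder` is a `cranelift_frontend::FunctionBuilder` and `addr` an `i64`
address value defined earlier):

    use cranelift_codegen::ir::{types::I32X4, MemFlags};

    // One instruction where a `load` + `splat` pair used to be emitted.
    let flags = MemFlags::new();
    let v = builder.ins().load_splat(I32X4, flags, addr, 0);

On AArch64 this selects `ld1r { vN.4s }, [xM]`; on the old x86 backend it is
legalized back into `load` + `splat` by `expand_load_splat`, and on the new
x64 backend the load is folded into the splat's insert-lane/shuffle sequence
as a memory operand.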