x64: Enable load-coalescing for SSE/AVX instructions (#5841)

* x64: Enable load-coalescing for SSE/AVX instructions

This commit unlocks the ability to fold loads into operands of SSE and
AVX instructions. When it happens this is beneficial both for function
size and for reducing register pressure. Previously this was not done
because most SSE instructions require their memory operands to be
aligned. AVX instructions, however, do not have alignment requirements.
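
As a rough illustration of the alignment constraint (hypothetical helper
name, not the actual Cranelift API), the folding decision boils down to:

```rust
// Most SSE-encoded vector instructions require a 16-byte-aligned memory
// operand; VEX (AVX) encodings accept unaligned memory operands.
fn may_use_memory_operand(uses_vex_encoding: bool, addr_known_16b_aligned: bool) -> bool {
    uses_vex_encoding || addr_known_16b_aligned
}

fn main() {
    // SSE encoding with a possibly-unaligned address: keep the load as a
    // separate instruction feeding a temporary register.
    assert!(!may_use_memory_operand(false, false));
    // AVX encoding: the load can always be folded.
    assert!(may_use_memory_operand(true, false));
}
```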

The solution implemented here is one recommended by Chris, which is to
add a new `XmmMemAligned` newtype wrapper around `XmmMem`. All SSE
instructions are now annotated as requiring an `XmmMemAligned` operand
except for new instruction styles used specifically for instructions
that don't require alignment (e.g. `movdqu` and the `*sd` and `*ss`
instructions). All existing instruction helpers continue to take
`XmmMem`, however. This way, if an AVX lowering is chosen it can be used
as-is. If an SSE lowering is chosen, however, then an automatic
conversion from `XmmMem` to `XmmMemAligned` kicks in. This automatic
conversion only fails for unaligned addresses, in which case a load
instruction is emitted and the operand becomes a temporary register
instead. A number of prior `Xmm` arguments have now been converted to
`XmmMem` as well.
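
A minimal sketch of the newtype idea, using simplified stand-in types
rather than the real `RegMem`/`Amode` definitions in the backend:

```rust
enum RegMem {
    Reg { reg: u8 },
    // Stand-in for a memory operand whose `MemFlags` mark it as 16-byte aligned.
    Mem { aligned: bool },
}

struct XmmMemAligned(RegMem);

impl XmmMemAligned {
    /// `None` means the operand is memory not known to be aligned, so the
    /// caller must first emit an unaligned load into a temporary register
    /// and use that register instead.
    fn new(rm: RegMem) -> Option<Self> {
        match rm {
            // Registers and aligned memory pass through unchanged.
            RegMem::Reg { .. } | RegMem::Mem { aligned: true } => Some(Self(rm)),
            // Unaligned memory is rejected by the fallible constructor.
            RegMem::Mem { aligned: false } => None,
        }
    }
}

fn main() {
    assert!(XmmMemAligned::new(RegMem::Reg { reg: 0 }).is_some());
    assert!(XmmMemAligned::new(RegMem::Mem { aligned: false }).is_none());
}
```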

One change from this commit is that loading an unaligned operand for an
SSE instruction previously would use the "correct type" of load, e.g.
`movups` for f32x4 or `movupd` for f64x2, but now the loading happens in
a context without type information so the `movdqu` instruction is
generated instead. According to [this Stack Overflow question][question]
modern processors won't penalize this "wrong" choice of type when the
operand is then used by f32- or f64-oriented instructions.

Finally this commit improves the reuse of logic in the `put_in_*_mem*`
helpers to share code with `sinkable_load` and avoid duplication. With
this in place various ISLE rules have been updated as well.
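
A rough sketch of the shared shape, with stand-in types and hypothetical
signatures (the real helpers live in the ISLE glue code and appear in the
diff below):

```rust
// Stand-ins for Cranelift's `Value`, `RegMem`, and lowering context.
struct Value(u32);
struct SinkableLoad(u32);
enum RegMem { Reg(u32), Mem(SinkableLoad) }

struct Ctx;
impl Ctx {
    // `Some` when `val` is produced by a unique, side-effect-free load
    // that may be merged into the consuming instruction.
    fn sinkable_load(&mut self, _val: &Value) -> Option<SinkableLoad> { None }
    fn sink_load(&mut self, load: SinkableLoad) -> RegMem { RegMem::Mem(load) }
    fn put_in_reg(&mut self, val: &Value) -> u32 { val.0 }

    /// Both `put_in_reg_mem` and `put_in_xmm_mem` now funnel through this
    /// shape: try to fold a sinkable load into a memory operand, otherwise
    /// materialize the value in a register.
    fn put_in_reg_mem(&mut self, val: Value) -> RegMem {
        if let Some(load) = self.sinkable_load(&val) {
            return self.sink_load(load);
        }
        RegMem::Reg(self.put_in_reg(&val))
    }
}

fn main() {
    let mut ctx = Ctx;
    match ctx.put_in_reg_mem(Value(0)) {
        RegMem::Reg(_) => println!("materialized in a register"),
        RegMem::Mem(_) => println!("folded into a memory operand"),
    }
}
```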

In the tests it can be seen that AVX instructions are now automatically
load-coalesced and use memory operands in a few cases.

[question]: https://stackoverflow.com/questions/40854819/is-there-any-situation-where-using-movdqu-and-movupd-is-better-than-movups

* Fix tests

* Fix move-and-extend to be unaligned

Unlike most other xmm instructions, these don't have alignment
requirements. Additionally add some ISA tests to ensure that their
output is covered.

* Review comments
Author: Alex Crichton
Date: 2023-02-21 13:10:19 -06:00
Committed by: GitHub
Parent: c65de1f1b1
Commit: d82ebcc102
11 changed files with 644 additions and 323 deletions


@@ -162,7 +162,7 @@
;; Arithmetic SIMD shifts. ;; Arithmetic SIMD shifts.
(XmmRmiReg (opcode SseOpcode) (XmmRmiReg (opcode SseOpcode)
(src1 Xmm) (src1 Xmm)
(src2 XmmMemImm) (src2 XmmMemAlignedImm)
(dst WritableXmm)) (dst WritableXmm))
;; Integer comparisons/tests: cmp or test (b w l q) (reg addr imm) reg. ;; Integer comparisons/tests: cmp or test (b w l q) (reg addr imm) reg.
@@ -193,7 +193,7 @@
;; XMM conditional move; overwrites the destination register. ;; XMM conditional move; overwrites the destination register.
(XmmCmove (ty Type) (XmmCmove (ty Type)
(cc CC) (cc CC)
(consequent XmmMem) (consequent XmmMemAligned)
(alternative Xmm) (alternative Xmm)
(dst WritableXmm)) (dst WritableXmm))
@@ -217,6 +217,12 @@
;; XMM (scalar or vector) binary op: (add sub and or xor mul adc? sbb?) ;; XMM (scalar or vector) binary op: (add sub and or xor mul adc? sbb?)
;; (32 64) (reg addr) reg ;; (32 64) (reg addr) reg
(XmmRmR (op SseOpcode) (XmmRmR (op SseOpcode)
(src1 Xmm)
(src2 XmmMemAligned)
(dst WritableXmm))
;; Same as `XmmRmR` except the memory operand can be unaligned
(XmmRmRUnaligned (op SseOpcode)
(src1 Xmm) (src1 Xmm)
(src2 XmmMem) (src2 XmmMem)
(dst WritableXmm)) (dst WritableXmm))
@@ -235,7 +241,7 @@
(XmmRmRBlend (XmmRmRBlend
(op SseOpcode) (op SseOpcode)
(src1 Xmm) (src1 Xmm)
(src2 XmmMem) (src2 XmmMemAligned)
(mask Xmm) (mask Xmm)
(dst WritableXmm)) (dst WritableXmm))
@@ -300,6 +306,12 @@
;; not have to be a previously valid value. This is characteristic of mov ;; not have to be a previously valid value. This is characteristic of mov
;; instructions. ;; instructions.
(XmmUnaryRmR (op SseOpcode) (XmmUnaryRmR (op SseOpcode)
(src XmmMemAligned)
(dst WritableXmm))
;; Same as `XmmUnaryRmR` but used for opcodes where the memory address
;; can be unaligned.
(XmmUnaryRmRUnaligned (op SseOpcode)
(src XmmMem) (src XmmMem)
(dst WritableXmm)) (dst WritableXmm))
@@ -309,7 +321,7 @@
;; XmmUnaryRmRImm is not used in the computation of the instruction dst ;; XmmUnaryRmRImm is not used in the computation of the instruction dst
;; value and so does not have to be a previously valid value. ;; value and so does not have to be a previously valid value.
(XmmUnaryRmRImm (op SseOpcode) (XmmUnaryRmRImm (op SseOpcode)
(src XmmMem) (src XmmMemAligned)
(imm u8) (imm u8)
(dst WritableXmm)) (dst WritableXmm))
@@ -380,7 +392,7 @@
;; Float comparisons/tests: cmp (b w l q) (reg addr imm) reg. ;; Float comparisons/tests: cmp (b w l q) (reg addr imm) reg.
(XmmCmpRmR (op SseOpcode) (XmmCmpRmR (op SseOpcode)
(src XmmMem) (src XmmMemAligned)
(dst Xmm)) (dst Xmm))
;; A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm ;; A binary XMM instruction with an 8-bit immediate: e.g. cmp (ps pd) imm
@@ -1334,7 +1346,9 @@
(type WritableXmm (primitive WritableXmm)) (type WritableXmm (primitive WritableXmm))
(type OptionWritableXmm (primitive OptionWritableXmm)) (type OptionWritableXmm (primitive OptionWritableXmm))
(type XmmMem extern (enum)) (type XmmMem extern (enum))
(type XmmMemAligned extern (enum))
(type XmmMemImm extern (enum)) (type XmmMemImm extern (enum))
(type XmmMemAlignedImm extern (enum))
;; Convert an `Imm8Reg` into an `Imm8Gpr`. ;; Convert an `Imm8Reg` into an `Imm8Gpr`.
(decl imm8_reg_to_imm8_gpr (Imm8Reg) Imm8Gpr) (decl imm8_reg_to_imm8_gpr (Imm8Reg) Imm8Gpr)
@@ -1384,6 +1398,25 @@
(decl xmm_mem_to_xmm_mem_imm (XmmMem) XmmMemImm) (decl xmm_mem_to_xmm_mem_imm (XmmMem) XmmMemImm)
(extern constructor xmm_mem_to_xmm_mem_imm xmm_mem_to_xmm_mem_imm) (extern constructor xmm_mem_to_xmm_mem_imm xmm_mem_to_xmm_mem_imm)
;; Convert an `XmmMem` into an `XmmMemAligned`.
;;
;; Note that this is an infallible conversion, not a fallible one. If the
;; original `XmmMem` source is a register, then it's passed through directly.
;; If it's `Mem` and refers to aligned memory, it's also passed through
;; directly. Otherwise, though, it's a memory source which is not aligned to
;; 16 bytes so a load is performed and the temporary register which is the
;; result of the load is passed through. The end-result is that the return value
;; here is guaranteed to be a register or an aligned memory location.
(decl xmm_mem_to_xmm_mem_aligned (XmmMem) XmmMemAligned)
(extern constructor xmm_mem_to_xmm_mem_aligned xmm_mem_to_xmm_mem_aligned)
;; Convert an `XmmMemImm` into an `XmmMemImmAligned`.
;;
;; Note that this is the same as `xmm_mem_to_xmm_mem_aligned` except it handles
;; an immediate case as well.
(decl xmm_mem_imm_to_xmm_mem_aligned_imm (XmmMemImm) XmmMemAlignedImm)
(extern constructor xmm_mem_imm_to_xmm_mem_aligned_imm xmm_mem_imm_to_xmm_mem_aligned_imm)
;; Allocate a new temporary GPR register. ;; Allocate a new temporary GPR register.
(decl temp_writable_gpr () WritableGpr) (decl temp_writable_gpr () WritableGpr)
(extern constructor temp_writable_gpr temp_writable_gpr) (extern constructor temp_writable_gpr temp_writable_gpr)
@@ -1801,23 +1834,19 @@
dst)) dst))
(rule 2 (x64_load $F32 addr _ext_kind) (rule 2 (x64_load $F32 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movss) (xmm_unary_rm_r_unaligned (SseOpcode.Movss) addr))
addr))
(rule 2 (x64_load $F64 addr _ext_kind) (rule 2 (x64_load $F64 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movsd) (xmm_unary_rm_r_unaligned (SseOpcode.Movsd) addr))
addr))
(rule 2 (x64_load $F32X4 addr _ext_kind) (rule 2 (x64_load $F32X4 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movups) (xmm_unary_rm_r_unaligned (SseOpcode.Movups) addr))
addr))
(rule 2 (x64_load $F64X2 addr _ext_kind) (rule 2 (x64_load $F64X2 addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movupd) (xmm_unary_rm_r_unaligned (SseOpcode.Movupd) addr))
addr))
(rule 0 (x64_load (multi_lane _bits _lanes) addr _ext_kind) (rule 0 (x64_load (multi_lane _bits _lanes) addr _ext_kind)
(xmm_unary_rm_r (SseOpcode.Movdqu) addr)) (xmm_unary_rm_r_unaligned (SseOpcode.Movdqu) addr))
(decl x64_mov (Amode) Reg) (decl x64_mov (Amode) Reg)
(rule (x64_mov addr) (rule (x64_mov addr)
@@ -1839,19 +1868,19 @@
(decl x64_movss_load (XmmMem) Xmm) (decl x64_movss_load (XmmMem) Xmm)
(rule (x64_movss_load from) (rule (x64_movss_load from)
(xmm_unary_rm_r (SseOpcode.Movss) from)) (xmm_unary_rm_r_unaligned (SseOpcode.Movss) from))
(decl x64_movsd_load (XmmMem) Xmm) (decl x64_movsd_load (XmmMem) Xmm)
(rule (x64_movsd_load from) (rule (x64_movsd_load from)
(xmm_unary_rm_r (SseOpcode.Movsd) from)) (xmm_unary_rm_r_unaligned (SseOpcode.Movsd) from))
(decl x64_movups (XmmMem) Xmm) (decl x64_movups (XmmMem) Xmm)
(rule (x64_movups from) (rule (x64_movups from)
(xmm_unary_rm_r (SseOpcode.Movups) from)) (xmm_unary_rm_r_unaligned (SseOpcode.Movups) from))
(decl x64_movupd (XmmMem) Xmm) (decl x64_movupd (XmmMem) Xmm)
(rule (x64_movupd from) (rule (x64_movupd from)
(xmm_unary_rm_r (SseOpcode.Movupd) from)) (xmm_unary_rm_r_unaligned (SseOpcode.Movupd) from))
(decl x64_movd (Xmm) Gpr) (decl x64_movd (Xmm) Gpr)
(rule (x64_movd from) (rule (x64_movd from)
@@ -1859,7 +1888,7 @@
(decl x64_movdqu (XmmMem) Xmm) (decl x64_movdqu (XmmMem) Xmm)
(rule (x64_movdqu from) (rule (x64_movdqu from)
(xmm_unary_rm_r (SseOpcode.Movdqu) from)) (xmm_unary_rm_r_unaligned (SseOpcode.Movdqu) from))
(decl x64_movapd (XmmMem) Xmm) (decl x64_movapd (XmmMem) Xmm)
(rule (x64_movapd src) (rule (x64_movapd src)
@@ -1867,27 +1896,27 @@
(decl x64_pmovsxbw (XmmMem) Xmm) (decl x64_pmovsxbw (XmmMem) Xmm)
(rule (x64_pmovsxbw from) (rule (x64_pmovsxbw from)
(xmm_unary_rm_r (SseOpcode.Pmovsxbw) from)) (xmm_unary_rm_r_unaligned (SseOpcode.Pmovsxbw) from))
(decl x64_pmovzxbw (XmmMem) Xmm) (decl x64_pmovzxbw (XmmMem) Xmm)
(rule (x64_pmovzxbw from) (rule (x64_pmovzxbw from)
(xmm_unary_rm_r (SseOpcode.Pmovzxbw) from)) (xmm_unary_rm_r_unaligned (SseOpcode.Pmovzxbw) from))
(decl x64_pmovsxwd (XmmMem) Xmm) (decl x64_pmovsxwd (XmmMem) Xmm)
(rule (x64_pmovsxwd from) (rule (x64_pmovsxwd from)
(xmm_unary_rm_r (SseOpcode.Pmovsxwd) from)) (xmm_unary_rm_r_unaligned (SseOpcode.Pmovsxwd) from))
(decl x64_pmovzxwd (XmmMem) Xmm) (decl x64_pmovzxwd (XmmMem) Xmm)
(rule (x64_pmovzxwd from) (rule (x64_pmovzxwd from)
(xmm_unary_rm_r (SseOpcode.Pmovzxwd) from)) (xmm_unary_rm_r_unaligned (SseOpcode.Pmovzxwd) from))
(decl x64_pmovsxdq (XmmMem) Xmm) (decl x64_pmovsxdq (XmmMem) Xmm)
(rule (x64_pmovsxdq from) (rule (x64_pmovsxdq from)
(xmm_unary_rm_r (SseOpcode.Pmovsxdq) from)) (xmm_unary_rm_r_unaligned (SseOpcode.Pmovsxdq) from))
(decl x64_pmovzxdq (XmmMem) Xmm) (decl x64_pmovzxdq (XmmMem) Xmm)
(rule (x64_pmovzxdq from) (rule (x64_pmovzxdq from)
(xmm_unary_rm_r (SseOpcode.Pmovzxdq) from)) (xmm_unary_rm_r_unaligned (SseOpcode.Pmovzxdq) from))
(decl x64_movrm (Type SyntheticAmode Gpr) SideEffectNoResult) (decl x64_movrm (Type SyntheticAmode Gpr) SideEffectNoResult)
(rule (x64_movrm ty addr data) (rule (x64_movrm ty addr data)
@@ -2178,7 +2207,7 @@
(cmp_rmi_r size (CmpOpcode.Cmp) (RegMemImm.Imm src1) src2)) (cmp_rmi_r size (CmpOpcode.Cmp) (RegMemImm.Imm src1) src2))
;; Helper for creating `MInst.XmmCmpRmR` instructions. ;; Helper for creating `MInst.XmmCmpRmR` instructions.
(decl xmm_cmp_rm_r (SseOpcode XmmMem Xmm) ProducesFlags) (decl xmm_cmp_rm_r (SseOpcode XmmMemAligned Xmm) ProducesFlags)
(rule (xmm_cmp_rm_r opcode src1 src2) (rule (xmm_cmp_rm_r opcode src1 src2)
(ProducesFlags.ProducesFlagsSideEffect (ProducesFlags.ProducesFlagsSideEffect
(MInst.XmmCmpRmR opcode src1 src2))) (MInst.XmmCmpRmR opcode src1 src2)))
@@ -2213,7 +2242,7 @@
(MInst.Cmove size cc consequent alternative dst) (MInst.Cmove size cc consequent alternative dst)
dst))) dst)))
(decl cmove_xmm (Type CC XmmMem Xmm) ConsumesFlags) (decl cmove_xmm (Type CC XmmMemAligned Xmm) ConsumesFlags)
(rule (cmove_xmm ty cc consequent alternative) (rule (cmove_xmm ty cc consequent alternative)
(let ((dst WritableXmm (temp_writable_xmm))) (let ((dst WritableXmm (temp_writable_xmm)))
(ConsumesFlags.ConsumesFlagsReturnsReg (ConsumesFlags.ConsumesFlagsReturnsReg
@@ -2266,7 +2295,7 @@
cmove2 cmove2
dst))) dst)))
(decl cmove_or_xmm (Type CC CC XmmMem Xmm) ConsumesFlags) (decl cmove_or_xmm (Type CC CC XmmMemAligned Xmm) ConsumesFlags)
(rule (cmove_or_xmm ty cc1 cc2 consequent alternative) (rule (cmove_or_xmm ty cc1 cc2 consequent alternative)
(let ((dst WritableXmm (temp_writable_xmm)) (let ((dst WritableXmm (temp_writable_xmm))
(tmp WritableXmm (temp_writable_xmm)) (tmp WritableXmm (temp_writable_xmm))
@@ -2324,12 +2353,19 @@
dst))) dst)))
;; Helper for creating `MInst.XmmRmR` instructions. ;; Helper for creating `MInst.XmmRmR` instructions.
(decl xmm_rm_r (SseOpcode Xmm XmmMem) Xmm) (decl xmm_rm_r (SseOpcode Xmm XmmMemAligned) Xmm)
(rule (xmm_rm_r op src1 src2) (rule (xmm_rm_r op src1 src2)
(let ((dst WritableXmm (temp_writable_xmm)) (let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (MInst.XmmRmR op src1 src2 dst)))) (_ Unit (emit (MInst.XmmRmR op src1 src2 dst))))
dst)) dst))
;; Helper for creating `MInst.XmmRmRUnaligned` instructions.
(decl xmm_rm_r_unaligned (SseOpcode Xmm XmmMem) Xmm)
(rule (xmm_rm_r_unaligned op src1 src2)
(let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (MInst.XmmRmRUnaligned op src1 src2 dst))))
dst))
;; Helper for creating `paddb` instructions. ;; Helper for creating `paddb` instructions.
(decl x64_paddb (Xmm XmmMem) Xmm) (decl x64_paddb (Xmm XmmMem) Xmm)
(rule 0 (x64_paddb src1 src2) (rule 0 (x64_paddb src1 src2)
@@ -2653,12 +2689,12 @@
;; Helper for creating `addss` instructions. ;; Helper for creating `addss` instructions.
(decl x64_addss (Xmm XmmMem) Xmm) (decl x64_addss (Xmm XmmMem) Xmm)
(rule (x64_addss src1 src2) (rule (x64_addss src1 src2)
(xmm_rm_r (SseOpcode.Addss) src1 src2)) (xmm_rm_r_unaligned (SseOpcode.Addss) src1 src2))
;; Helper for creating `addsd` instructions. ;; Helper for creating `addsd` instructions.
(decl x64_addsd (Xmm XmmMem) Xmm) (decl x64_addsd (Xmm XmmMem) Xmm)
(rule (x64_addsd src1 src2) (rule (x64_addsd src1 src2)
(xmm_rm_r (SseOpcode.Addsd) src1 src2)) (xmm_rm_r_unaligned (SseOpcode.Addsd) src1 src2))
;; Helper for creating `addps` instructions. ;; Helper for creating `addps` instructions.
(decl x64_addps (Xmm XmmMem) Xmm) (decl x64_addps (Xmm XmmMem) Xmm)
@@ -2679,12 +2715,12 @@
;; Helper for creating `subss` instructions. ;; Helper for creating `subss` instructions.
(decl x64_subss (Xmm XmmMem) Xmm) (decl x64_subss (Xmm XmmMem) Xmm)
(rule (x64_subss src1 src2) (rule (x64_subss src1 src2)
(xmm_rm_r (SseOpcode.Subss) src1 src2)) (xmm_rm_r_unaligned (SseOpcode.Subss) src1 src2))
;; Helper for creating `subsd` instructions. ;; Helper for creating `subsd` instructions.
(decl x64_subsd (Xmm XmmMem) Xmm) (decl x64_subsd (Xmm XmmMem) Xmm)
(rule (x64_subsd src1 src2) (rule (x64_subsd src1 src2)
(xmm_rm_r (SseOpcode.Subsd) src1 src2)) (xmm_rm_r_unaligned (SseOpcode.Subsd) src1 src2))
;; Helper for creating `subps` instructions. ;; Helper for creating `subps` instructions.
(decl x64_subps (Xmm XmmMem) Xmm) (decl x64_subps (Xmm XmmMem) Xmm)
@@ -2705,12 +2741,12 @@
;; Helper for creating `mulss` instructions. ;; Helper for creating `mulss` instructions.
(decl x64_mulss (Xmm XmmMem) Xmm) (decl x64_mulss (Xmm XmmMem) Xmm)
(rule (x64_mulss src1 src2) (rule (x64_mulss src1 src2)
(xmm_rm_r (SseOpcode.Mulss) src1 src2)) (xmm_rm_r_unaligned (SseOpcode.Mulss) src1 src2))
;; Helper for creating `mulsd` instructions. ;; Helper for creating `mulsd` instructions.
(decl x64_mulsd (Xmm XmmMem) Xmm) (decl x64_mulsd (Xmm XmmMem) Xmm)
(rule (x64_mulsd src1 src2) (rule (x64_mulsd src1 src2)
(xmm_rm_r (SseOpcode.Mulsd) src1 src2)) (xmm_rm_r_unaligned (SseOpcode.Mulsd) src1 src2))
;; Helper for creating `mulps` instructions. ;; Helper for creating `mulps` instructions.
(decl x64_mulps (Xmm XmmMem) Xmm) (decl x64_mulps (Xmm XmmMem) Xmm)
@@ -2731,12 +2767,12 @@
;; Helper for creating `divss` instructions. ;; Helper for creating `divss` instructions.
(decl x64_divss (Xmm XmmMem) Xmm) (decl x64_divss (Xmm XmmMem) Xmm)
(rule (x64_divss src1 src2) (rule (x64_divss src1 src2)
(xmm_rm_r (SseOpcode.Divss) src1 src2)) (xmm_rm_r_unaligned (SseOpcode.Divss) src1 src2))
;; Helper for creating `divsd` instructions. ;; Helper for creating `divsd` instructions.
(decl x64_divsd (Xmm XmmMem) Xmm) (decl x64_divsd (Xmm XmmMem) Xmm)
(rule (x64_divsd src1 src2) (rule (x64_divsd src1 src2)
(xmm_rm_r (SseOpcode.Divsd) src1 src2)) (xmm_rm_r_unaligned (SseOpcode.Divsd) src1 src2))
;; Helper for creating `divps` instructions. ;; Helper for creating `divps` instructions.
(decl x64_divps (Xmm XmmMem) Xmm) (decl x64_divps (Xmm XmmMem) Xmm)
@@ -2755,7 +2791,7 @@
(xmm_rmir_vex (AvxOpcode.Vdivpd) src1 src2)) (xmm_rmir_vex (AvxOpcode.Vdivpd) src1 src2))
;; Helper for creating `XmmRmRBlend` instructions ;; Helper for creating `XmmRmRBlend` instructions
(decl xmm_rm_r_blend (SseOpcode Xmm XmmMem Xmm) Xmm) (decl xmm_rm_r_blend (SseOpcode Xmm XmmMemAligned Xmm) Xmm)
(rule (xmm_rm_r_blend op src1 src2 mask) (rule (xmm_rm_r_blend op src1 src2 mask)
(let ((dst WritableXmm (temp_writable_xmm)) (let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (MInst.XmmRmRBlend op src1 src2 mask dst)))) (_ Unit (emit (MInst.XmmRmRBlend op src1 src2 mask dst))))
@@ -2801,7 +2837,7 @@
;; Helper for creating `movsd` instructions. ;; Helper for creating `movsd` instructions.
(decl x64_movsd_regmove (Xmm XmmMem) Xmm) (decl x64_movsd_regmove (Xmm XmmMem) Xmm)
(rule (x64_movsd_regmove src1 src2) (rule (x64_movsd_regmove src1 src2)
(xmm_rm_r (SseOpcode.Movsd) src1 src2)) (xmm_rm_r_unaligned (SseOpcode.Movsd) src1 src2))
;; Helper for creating `movlhps` instructions. ;; Helper for creating `movlhps` instructions.
(decl x64_movlhps (Xmm XmmMem) Xmm) (decl x64_movlhps (Xmm XmmMem) Xmm)
@@ -3063,7 +3099,7 @@
dst)) dst))
;; Helper for constructing `XmmUnaryRmRImm` instructions. ;; Helper for constructing `XmmUnaryRmRImm` instructions.
(decl xmm_unary_rm_r_imm (SseOpcode XmmMem u8) Xmm) (decl xmm_unary_rm_r_imm (SseOpcode XmmMemAligned u8) Xmm)
(rule (xmm_unary_rm_r_imm op src1 imm) (rule (xmm_unary_rm_r_imm op src1 imm)
(let ((dst WritableXmm (temp_writable_xmm)) (let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (MInst.XmmUnaryRmRImm op src1 imm dst)))) (_ Unit (emit (MInst.XmmUnaryRmRImm op src1 imm dst))))
@@ -3142,12 +3178,19 @@
(xmm_rmr_imm_vex (AvxOpcode.Vshufps) src1 src2 byte)) (xmm_rmr_imm_vex (AvxOpcode.Vshufps) src1 src2 byte))
;; Helper for creating `MInst.XmmUnaryRmR` instructions. ;; Helper for creating `MInst.XmmUnaryRmR` instructions.
(decl xmm_unary_rm_r (SseOpcode XmmMem) Xmm) (decl xmm_unary_rm_r (SseOpcode XmmMemAligned) Xmm)
(rule (xmm_unary_rm_r op src) (rule (xmm_unary_rm_r op src)
(let ((dst WritableXmm (temp_writable_xmm)) (let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (MInst.XmmUnaryRmR op src dst)))) (_ Unit (emit (MInst.XmmUnaryRmR op src dst))))
dst)) dst))
;; Helper for creating `MInst.XmmUnaryRmRUnaligned` instructions.
(decl xmm_unary_rm_r_unaligned (SseOpcode XmmMem) Xmm)
(rule (xmm_unary_rm_r_unaligned op src)
(let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (MInst.XmmUnaryRmRUnaligned op src dst))))
dst))
;; Helper for creating `pabsb` instructions. ;; Helper for creating `pabsb` instructions.
(decl x64_pabsb (XmmMem) Xmm) (decl x64_pabsb (XmmMem) Xmm)
(rule (x64_pabsb src) (rule (x64_pabsb src)
@@ -3240,7 +3283,7 @@
(mul_hi ty $false src1 src2)) (mul_hi ty $false src1 src2))
;; Helper for creating `MInst.XmmRmiXmm` instructions. ;; Helper for creating `MInst.XmmRmiXmm` instructions.
(decl xmm_rmi_xmm (SseOpcode Xmm XmmMemImm) Xmm) (decl xmm_rmi_xmm (SseOpcode Xmm XmmMemAlignedImm) Xmm)
(rule (xmm_rmi_xmm op src1 src2) (rule (xmm_rmi_xmm op src1 src2)
(let ((dst WritableXmm (temp_writable_xmm)) (let ((dst WritableXmm (temp_writable_xmm))
(_ Unit (emit (MInst.XmmRmiReg op (_ Unit (emit (MInst.XmmRmiReg op
@@ -3482,22 +3525,17 @@
dst)) dst))
;; Helper for creating `minss` instructions. ;; Helper for creating `minss` instructions.
(decl x64_minss (Xmm Xmm) Xmm) (decl x64_minss (Xmm XmmMem) Xmm)
(rule (x64_minss x y) (rule (x64_minss x y)
(let ((dst WritableXmm (temp_writable_xmm)) (xmm_rm_r_unaligned (SseOpcode.Minss) x y))
(_ Unit (emit (MInst.XmmRmR (SseOpcode.Minss) x y dst))))
dst))
;; Helper for creating `minsd` instructions. ;; Helper for creating `minsd` instructions.
(decl x64_minsd (Xmm Xmm) Xmm) (decl x64_minsd (Xmm XmmMem) Xmm)
(rule (x64_minsd x y) (rule (x64_minsd x y)
(let ((dst WritableXmm (temp_writable_xmm)) (xmm_rm_r_unaligned (SseOpcode.Minsd) x y))
(_ Unit (emit (MInst.XmmRmR (SseOpcode.Minsd) x y dst))))
dst))
;; Helper for creating `minps` instructions. ;; Helper for creating `minps` instructions.
(decl x64_minps (Xmm Xmm) Xmm) (decl x64_minps (Xmm XmmMem) Xmm)
(rule 0 (x64_minps x y) (rule 0 (x64_minps x y)
(xmm_rm_r (SseOpcode.Minps) x y)) (xmm_rm_r (SseOpcode.Minps) x y))
(rule 1 (x64_minps x y) (rule 1 (x64_minps x y)
@@ -3505,7 +3543,7 @@
(xmm_rmir_vex (AvxOpcode.Vminps) x y)) (xmm_rmir_vex (AvxOpcode.Vminps) x y))
;; Helper for creating `minpd` instructions. ;; Helper for creating `minpd` instructions.
(decl x64_minpd (Xmm Xmm) Xmm) (decl x64_minpd (Xmm XmmMem) Xmm)
(rule 0 (x64_minpd x y) (rule 0 (x64_minpd x y)
(xmm_rm_r (SseOpcode.Minpd) x y)) (xmm_rm_r (SseOpcode.Minpd) x y))
(rule 1 (x64_minpd x y) (rule 1 (x64_minpd x y)
@@ -3513,17 +3551,17 @@
(xmm_rmir_vex (AvxOpcode.Vminpd) x y)) (xmm_rmir_vex (AvxOpcode.Vminpd) x y))
;; Helper for creating `maxss` instructions. ;; Helper for creating `maxss` instructions.
(decl x64_maxss (Xmm Xmm) Xmm) (decl x64_maxss (Xmm XmmMem) Xmm)
(rule (x64_maxss x y) (rule (x64_maxss x y)
(xmm_rm_r (SseOpcode.Maxss) x y)) (xmm_rm_r_unaligned (SseOpcode.Maxss) x y))
;; Helper for creating `maxsd` instructions. ;; Helper for creating `maxsd` instructions.
(decl x64_maxsd (Xmm Xmm) Xmm) (decl x64_maxsd (Xmm XmmMem) Xmm)
(rule (x64_maxsd x y) (rule (x64_maxsd x y)
(xmm_rm_r (SseOpcode.Maxsd) x y)) (xmm_rm_r_unaligned (SseOpcode.Maxsd) x y))
;; Helper for creating `maxps` instructions. ;; Helper for creating `maxps` instructions.
(decl x64_maxps (Xmm Xmm) Xmm) (decl x64_maxps (Xmm XmmMem) Xmm)
(rule 0 (x64_maxps x y) (rule 0 (x64_maxps x y)
(xmm_rm_r (SseOpcode.Maxps) x y)) (xmm_rm_r (SseOpcode.Maxps) x y))
(rule 1 (x64_maxps x y) (rule 1 (x64_maxps x y)
@@ -3531,7 +3569,7 @@
(xmm_rmir_vex (AvxOpcode.Vmaxps) x y)) (xmm_rmir_vex (AvxOpcode.Vmaxps) x y))
;; Helper for creating `maxpd` instructions. ;; Helper for creating `maxpd` instructions.
(decl x64_maxpd (Xmm Xmm) Xmm) (decl x64_maxpd (Xmm XmmMem) Xmm)
(rule 0 (x64_maxpd x y) (rule 0 (x64_maxpd x y)
(xmm_rm_r (SseOpcode.Maxpd) x y)) (xmm_rm_r (SseOpcode.Maxpd) x y))
(rule 1 (x64_maxpd x y) (rule 1 (x64_maxpd x y)
@@ -3590,19 +3628,19 @@
;; Helper for creating `sqrtss` instructions. ;; Helper for creating `sqrtss` instructions.
(decl x64_sqrtss (Xmm) Xmm) (decl x64_sqrtss (XmmMem) Xmm)
(rule (x64_sqrtss x) (xmm_unary_rm_r (SseOpcode.Sqrtss) x)) (rule (x64_sqrtss x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtss) x))
;; Helper for creating `sqrtsd` instructions. ;; Helper for creating `sqrtsd` instructions.
(decl x64_sqrtsd (Xmm) Xmm) (decl x64_sqrtsd (XmmMem) Xmm)
(rule (x64_sqrtsd x) (xmm_unary_rm_r (SseOpcode.Sqrtsd) x)) (rule (x64_sqrtsd x) (xmm_unary_rm_r_unaligned (SseOpcode.Sqrtsd) x))
;; Helper for creating `sqrtps` instructions. ;; Helper for creating `sqrtps` instructions.
(decl x64_sqrtps (Xmm) Xmm) (decl x64_sqrtps (XmmMem) Xmm)
(rule (x64_sqrtps x) (xmm_unary_rm_r (SseOpcode.Sqrtps) x)) (rule (x64_sqrtps x) (xmm_unary_rm_r (SseOpcode.Sqrtps) x))
;; Helper for creating `sqrtpd` instructions. ;; Helper for creating `sqrtpd` instructions.
(decl x64_sqrtpd (Xmm) Xmm) (decl x64_sqrtpd (XmmMem) Xmm)
(rule (x64_sqrtpd x) (xmm_unary_rm_r (SseOpcode.Sqrtpd) x)) (rule (x64_sqrtpd x) (xmm_unary_rm_r (SseOpcode.Sqrtpd) x))
;; Helper for creating `cvtss2sd` instructions. ;; Helper for creating `cvtss2sd` instructions.
@@ -3614,19 +3652,19 @@
(rule (x64_cvtsd2ss x) (xmm_unary_rm_r (SseOpcode.Cvtsd2ss) x)) (rule (x64_cvtsd2ss x) (xmm_unary_rm_r (SseOpcode.Cvtsd2ss) x))
;; Helper for creating `cvtdq2ps` instructions. ;; Helper for creating `cvtdq2ps` instructions.
(decl x64_cvtdq2ps (Xmm) Xmm) (decl x64_cvtdq2ps (XmmMem) Xmm)
(rule (x64_cvtdq2ps x) (xmm_unary_rm_r (SseOpcode.Cvtdq2ps) x)) (rule (x64_cvtdq2ps x) (xmm_unary_rm_r (SseOpcode.Cvtdq2ps) x))
;; Helper for creating `cvtps2pd` instructions. ;; Helper for creating `cvtps2pd` instructions.
(decl x64_cvtps2pd (Xmm) Xmm) (decl x64_cvtps2pd (XmmMem) Xmm)
(rule (x64_cvtps2pd x) (xmm_unary_rm_r (SseOpcode.Cvtps2pd) x)) (rule (x64_cvtps2pd x) (xmm_unary_rm_r (SseOpcode.Cvtps2pd) x))
;; Helper for creating `cvtpd2ps` instructions. ;; Helper for creating `cvtpd2ps` instructions.
(decl x64_cvtpd2ps (Xmm) Xmm) (decl x64_cvtpd2ps (XmmMem) Xmm)
(rule (x64_cvtpd2ps x) (xmm_unary_rm_r (SseOpcode.Cvtpd2ps) x)) (rule (x64_cvtpd2ps x) (xmm_unary_rm_r (SseOpcode.Cvtpd2ps) x))
;; Helper for creating `cvtdq2pd` instructions. ;; Helper for creating `cvtdq2pd` instructions.
(decl x64_cvtdq2pd (Xmm) Xmm) (decl x64_cvtdq2pd (XmmMem) Xmm)
(rule (x64_cvtdq2pd x) (xmm_unary_rm_r (SseOpcode.Cvtdq2pd) x)) (rule (x64_cvtdq2pd x) (xmm_unary_rm_r (SseOpcode.Cvtdq2pd) x))
;; Helper for creating `cvtsi2ss` instructions. ;; Helper for creating `cvtsi2ss` instructions.
@@ -3640,8 +3678,8 @@
(gpr_to_xmm (SseOpcode.Cvtsi2sd) x (raw_operand_size_of_type ty))) (gpr_to_xmm (SseOpcode.Cvtsi2sd) x (raw_operand_size_of_type ty)))
;; Helper for creating `cvttps2dq` instructions. ;; Helper for creating `cvttps2dq` instructions.
(decl x64_cvttps2dq (Type XmmMem) Xmm) (decl x64_cvttps2dq (XmmMem) Xmm)
(rule (x64_cvttps2dq ty x) (rule (x64_cvttps2dq x)
(xmm_unary_rm_r (SseOpcode.Cvttps2dq) x)) (xmm_unary_rm_r (SseOpcode.Cvttps2dq) x))
;; Helper for creating `cvttpd2dq` instructions. ;; Helper for creating `cvttpd2dq` instructions.
@@ -4262,6 +4300,7 @@
(convert Xmm InstOutput output_xmm) (convert Xmm InstOutput output_xmm)
(convert Value Xmm put_in_xmm) (convert Value Xmm put_in_xmm)
(convert Value XmmMem put_in_xmm_mem) (convert Value XmmMem put_in_xmm_mem)
(convert Value XmmMemAligned put_in_xmm_mem_aligned)
(convert Value XmmMemImm put_in_xmm_mem_imm) (convert Value XmmMemImm put_in_xmm_mem_imm)
(convert Xmm Reg xmm_to_reg) (convert Xmm Reg xmm_to_reg)
(convert Xmm RegMem xmm_to_reg_mem) (convert Xmm RegMem xmm_to_reg_mem)
@@ -4272,6 +4311,7 @@
(convert RegMemImm XmmMemImm mov_rmi_to_xmm) (convert RegMemImm XmmMemImm mov_rmi_to_xmm)
(convert Xmm XmmMem xmm_to_xmm_mem) (convert Xmm XmmMem xmm_to_xmm_mem)
(convert Xmm XmmMemImm xmm_to_xmm_mem_imm) (convert Xmm XmmMemImm xmm_to_xmm_mem_imm)
(convert Xmm XmmMemAligned xmm_to_xmm_mem_aligned)
(convert XmmMem XmmMemImm xmm_mem_to_xmm_mem_imm) (convert XmmMem XmmMemImm xmm_mem_to_xmm_mem_imm)
(convert XmmMem RegMem xmm_mem_to_reg_mem) (convert XmmMem RegMem xmm_mem_to_reg_mem)
(convert WritableXmm Xmm writable_xmm_to_xmm) (convert WritableXmm Xmm writable_xmm_to_xmm)
@@ -4280,6 +4320,15 @@
(convert WritableXmm XmmMem writable_xmm_to_xmm_mem) (convert WritableXmm XmmMem writable_xmm_to_xmm_mem)
(convert WritableXmm ValueRegs writable_xmm_to_value_regs) (convert WritableXmm ValueRegs writable_xmm_to_value_regs)
;; Note that these conversions will introduce a `movupd` instruction if
;; the memory location is not aligned to a 16-byte boundary. This is primarily
;; used to convert `XmmMem` inputs, which themselves were typically created
;; via the `put_in_xmm_mem` constructor, into operands of SSE instructions.
;; Most pre-AVX instructions working with 16-bytes of data (e.g. full xmm
;; registers) require 16-byte alignment.
(convert XmmMem XmmMemAligned xmm_mem_to_xmm_mem_aligned)
(convert XmmMemImm XmmMemAlignedImm xmm_mem_imm_to_xmm_mem_aligned_imm)
(convert Gpr Imm8Gpr gpr_to_imm8_gpr) (convert Gpr Imm8Gpr gpr_to_imm8_gpr)
(convert Imm8Reg Imm8Gpr imm8_reg_to_imm8_gpr) (convert Imm8Reg Imm8Gpr imm8_reg_to_imm8_gpr)
@@ -4288,6 +4337,8 @@
(convert SyntheticAmode GprMem synthetic_amode_to_gpr_mem) (convert SyntheticAmode GprMem synthetic_amode_to_gpr_mem)
(convert Amode XmmMem amode_to_xmm_mem) (convert Amode XmmMem amode_to_xmm_mem)
(convert SyntheticAmode XmmMem synthetic_amode_to_xmm_mem) (convert SyntheticAmode XmmMem synthetic_amode_to_xmm_mem)
(convert Amode XmmMemAligned amode_to_xmm_mem_aligned)
(convert SyntheticAmode XmmMemAligned synthetic_amode_to_xmm_mem_aligned)
(convert IntCC CC intcc_to_cc) (convert IntCC CC intcc_to_cc)
(convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op) (convert AtomicRmwOp MachAtomicRmwOp atomic_rmw_op_to_mach_atomic_rmw_op)
@@ -4337,6 +4388,15 @@
(decl const_to_synthetic_amode (VCodeConstant) SyntheticAmode) (decl const_to_synthetic_amode (VCodeConstant) SyntheticAmode)
(extern constructor const_to_synthetic_amode const_to_synthetic_amode) (extern constructor const_to_synthetic_amode const_to_synthetic_amode)
(decl xmm_to_xmm_mem_aligned (Xmm) XmmMemAligned)
(rule (xmm_to_xmm_mem_aligned reg) (xmm_mem_to_xmm_mem_aligned reg))
(decl amode_to_xmm_mem_aligned (Amode) XmmMemAligned)
(rule (amode_to_xmm_mem_aligned mode) (amode_to_xmm_mem mode))
(decl synthetic_amode_to_xmm_mem_aligned (SyntheticAmode) XmmMemAligned)
(rule (synthetic_amode_to_xmm_mem_aligned mode) (synthetic_amode_to_xmm_mem mode))
(decl put_in_xmm_mem_aligned (Value) XmmMemAligned)
(rule (put_in_xmm_mem_aligned val) (put_in_xmm_mem val))
;; Helper for creating `MovFromPReg` instructions. ;; Helper for creating `MovFromPReg` instructions.
(decl mov_from_preg (PReg) Reg) (decl mov_from_preg (PReg) Reg)
(rule (mov_from_preg preg) (rule (mov_from_preg preg)


@@ -31,8 +31,8 @@ macro_rules! newtype_of_reg {
$newtype_reg:ident, $newtype_reg:ident,
$newtype_writable_reg:ident, $newtype_writable_reg:ident,
$newtype_option_writable_reg:ident, $newtype_option_writable_reg:ident,
$newtype_reg_mem:ident, reg_mem: ($($newtype_reg_mem:ident $(aligned:$aligned:ident)?),*),
$newtype_reg_mem_imm:ident, reg_mem_imm: ($($newtype_reg_mem_imm:ident $(aligned:$aligned_imm:ident)?),*),
$newtype_imm8_reg:ident, $newtype_imm8_reg:ident,
|$check_reg:ident| $check:expr |$check_reg:ident| $check:expr
) => { ) => {
@@ -102,6 +102,7 @@ macro_rules! newtype_of_reg {
} }
} }
$(
/// A newtype wrapper around `RegMem` for general-purpose registers. /// A newtype wrapper around `RegMem` for general-purpose registers.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct $newtype_reg_mem(RegMem); pub struct $newtype_reg_mem(RegMem);
@@ -124,7 +125,19 @@ macro_rules! newtype_of_reg {
/// newtype. /// newtype.
pub fn new(rm: RegMem) -> Option<Self> { pub fn new(rm: RegMem) -> Option<Self> {
match rm { match rm {
RegMem::Mem { addr: _ } => Some(Self(rm)), RegMem::Mem { addr } => {
let mut _allow = true;
$(
if $aligned {
_allow = addr.aligned();
}
)?
if _allow {
Some(Self(RegMem::Mem { addr }))
} else {
None
}
}
RegMem::Reg { reg: $check_reg } if $check => Some(Self(rm)), RegMem::Reg { reg: $check_reg } if $check => Some(Self(rm)),
RegMem::Reg { reg: _ } => None, RegMem::Reg { reg: _ } => None,
} }
@@ -148,7 +161,9 @@ macro_rules! newtype_of_reg {
self.0.pretty_print(size, allocs) self.0.pretty_print(size, allocs)
} }
} }
)*
$(
/// A newtype wrapper around `RegMemImm`. /// A newtype wrapper around `RegMemImm`.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct $newtype_reg_mem_imm(RegMemImm); pub struct $newtype_reg_mem_imm(RegMemImm);
@@ -165,12 +180,6 @@ macro_rules! newtype_of_reg {
} }
} }
impl From<$newtype_reg_mem> for $newtype_reg_mem_imm {
fn from(r: $newtype_reg_mem) -> Self {
$newtype_reg_mem_imm(r.0.into())
}
}
impl $newtype_reg_mem_imm { impl $newtype_reg_mem_imm {
/// Construct this newtype from the given `RegMemImm`, or return /// Construct this newtype from the given `RegMemImm`, or return
/// `None` if the `RegMemImm` is not a valid instance of this /// `None` if the `RegMemImm` is not a valid instance of this
@@ -178,7 +187,19 @@ macro_rules! newtype_of_reg {
pub fn new(rmi: RegMemImm) -> Option<Self> { pub fn new(rmi: RegMemImm) -> Option<Self> {
match rmi { match rmi {
RegMemImm::Imm { .. } => Some(Self(rmi)), RegMemImm::Imm { .. } => Some(Self(rmi)),
RegMemImm::Mem { addr: _ } => Some(Self(rmi)), RegMemImm::Mem { addr } => {
let mut _allow = true;
$(
if $aligned_imm {
_allow = addr.aligned();
}
)?
if _allow {
Some(Self(RegMemImm::Mem { addr }))
} else {
None
}
}
RegMemImm::Reg { reg: $check_reg } if $check => Some(Self(rmi)), RegMemImm::Reg { reg: $check_reg } if $check => Some(Self(rmi)),
RegMemImm::Reg { reg: _ } => None, RegMemImm::Reg { reg: _ } => None,
} }
@@ -204,6 +225,7 @@ macro_rules! newtype_of_reg {
self.0.pretty_print(size, allocs) self.0.pretty_print(size, allocs)
} }
} }
)*
/// A newtype wrapper around `Imm8Reg`. /// A newtype wrapper around `Imm8Reg`.
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
@@ -242,8 +264,8 @@ newtype_of_reg!(
Gpr, Gpr,
WritableGpr, WritableGpr,
OptionWritableGpr, OptionWritableGpr,
GprMem, reg_mem: (GprMem),
GprMemImm, reg_mem_imm: (GprMemImm),
Imm8Gpr, Imm8Gpr,
|reg| reg.class() == RegClass::Int |reg| reg.class() == RegClass::Int
); );
@@ -253,8 +275,8 @@ newtype_of_reg!(
Xmm, Xmm,
WritableXmm, WritableXmm,
OptionWritableXmm, OptionWritableXmm,
XmmMem, reg_mem: (XmmMem, XmmMemAligned aligned:true),
XmmMemImm, reg_mem_imm: (XmmMemImm, XmmMemAlignedImm aligned:true),
Imm8Xmm, Imm8Xmm,
|reg| reg.class() == RegClass::Float |reg| reg.class() == RegClass::Float
); );
@@ -420,6 +442,10 @@ impl Amode {
} }
ret ret
} }
pub(crate) fn aligned(&self) -> bool {
self.get_flags().aligned()
}
} }
impl PrettyPrint for Amode { impl PrettyPrint for Amode {
@@ -531,6 +557,13 @@ impl SyntheticAmode {
} }
} }
} }
pub(crate) fn aligned(&self) -> bool {
match self {
SyntheticAmode::Real(addr) => addr.aligned(),
SyntheticAmode::NominalSPOffset { .. } | SyntheticAmode::ConstantOffset { .. } => true,
}
}
} }
impl Into<SyntheticAmode> for Amode { impl Into<SyntheticAmode> for Amode {


@@ -1731,7 +1731,21 @@ pub(crate) fn emit(
sink.bind_label(else_label); sink.bind_label(else_label);
} }
Inst::XmmUnaryRmR { Inst::XmmUnaryRmR { op, src, dst } => {
emit(
&Inst::XmmUnaryRmRUnaligned {
op: *op,
src: XmmMem::new(src.clone().into()).unwrap(),
dst: *dst,
},
allocs,
sink,
info,
state,
);
}
Inst::XmmUnaryRmRUnaligned {
op, op,
src: src_e, src: src_e,
dst: reg_g, dst: reg_g,
@@ -1842,6 +1856,24 @@ pub(crate) fn emit(
} }
Inst::XmmRmR { Inst::XmmRmR {
op,
src1,
src2,
dst,
} => emit(
&Inst::XmmRmRUnaligned {
op: *op,
dst: *dst,
src1: *src1,
src2: XmmMem::new(src2.clone().to_reg_mem()).unwrap(),
},
allocs,
sink,
info,
state,
),
Inst::XmmRmRUnaligned {
op, op,
src1, src1,
src2: src_e, src2: src_e,


@@ -34,7 +34,7 @@ impl Inst {
debug_assert!(dst.to_reg().class() == RegClass::Float); debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUnaryRmRImm { Inst::XmmUnaryRmRImm {
op, op,
src: XmmMem::new(src).unwrap(), src: XmmMemAligned::new(src).unwrap(),
imm, imm,
dst: WritableXmm::from_writable_reg(dst).unwrap(), dst: WritableXmm::from_writable_reg(dst).unwrap(),
} }
@@ -56,7 +56,7 @@ impl Inst {
Inst::XmmRmiReg { Inst::XmmRmiReg {
opcode, opcode,
src1: Xmm::new(dst.to_reg()).unwrap(), src1: Xmm::new(dst.to_reg()).unwrap(),
src2: XmmMemImm::new(src).unwrap(), src2: XmmMemAlignedImm::new(src).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(), dst: WritableXmm::from_writable_reg(dst).unwrap(),
} }
} }
@@ -96,7 +96,7 @@ impl Inst {
debug_assert!(dst.to_reg().class() == RegClass::Float); debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUnaryRmR { Inst::XmmUnaryRmR {
op, op,
src: XmmMem::new(src).unwrap(), src: XmmMemAligned::new(src).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(), dst: WritableXmm::from_writable_reg(dst).unwrap(),
} }
} }
@@ -136,7 +136,7 @@ impl Inst {
Inst::XmmRmRBlend { Inst::XmmRmRBlend {
op, op,
src1: Xmm::new(dst.to_reg()).unwrap(), src1: Xmm::new(dst.to_reg()).unwrap(),
src2: XmmMem::new(src2).unwrap(), src2: XmmMemAligned::new(src2).unwrap(),
mask: Xmm::new(regs::xmm0()).unwrap(), mask: Xmm::new(regs::xmm0()).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(), dst: WritableXmm::from_writable_reg(dst).unwrap(),
} }


@@ -133,11 +133,13 @@ impl Inst {
| Inst::XmmMovRM { op, .. } | Inst::XmmMovRM { op, .. }
| Inst::XmmRmiReg { opcode: op, .. } | Inst::XmmRmiReg { opcode: op, .. }
| Inst::XmmRmR { op, .. } | Inst::XmmRmR { op, .. }
| Inst::XmmRmRUnaligned { op, .. }
| Inst::XmmRmRBlend { op, .. } | Inst::XmmRmRBlend { op, .. }
| Inst::XmmRmRImm { op, .. } | Inst::XmmRmRImm { op, .. }
| Inst::XmmToGpr { op, .. } | Inst::XmmToGpr { op, .. }
| Inst::XmmToGprImm { op, .. } | Inst::XmmToGprImm { op, .. }
| Inst::XmmUnaryRmRImm { op, .. } | Inst::XmmUnaryRmRImm { op, .. }
| Inst::XmmUnaryRmRUnaligned { op, .. }
| Inst::XmmUnaryRmR { op, .. } | Inst::XmmUnaryRmR { op, .. }
| Inst::XmmConstOp { op, .. } => smallvec![op.available_from()], | Inst::XmmConstOp { op, .. } => smallvec![op.available_from()],
@@ -293,7 +295,7 @@ impl Inst {
debug_assert!(dst.to_reg().class() == RegClass::Float); debug_assert!(dst.to_reg().class() == RegClass::Float);
Inst::XmmUnaryRmR { Inst::XmmUnaryRmR {
op, op,
src: XmmMem::new(src).unwrap(), src: XmmMemAligned::new(src).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(), dst: WritableXmm::from_writable_reg(dst).unwrap(),
} }
} }
@@ -304,7 +306,7 @@ impl Inst {
Inst::XmmRmR { Inst::XmmRmR {
op, op,
src1: Xmm::new(dst.to_reg()).unwrap(), src1: Xmm::new(dst.to_reg()).unwrap(),
src2: XmmMem::new(src).unwrap(), src2: XmmMemAligned::new(src).unwrap(),
dst: WritableXmm::from_writable_reg(dst).unwrap(), dst: WritableXmm::from_writable_reg(dst).unwrap(),
} }
} }
@@ -369,7 +371,7 @@ impl Inst {
pub(crate) fn xmm_cmp_rm_r(op: SseOpcode, src: RegMem, dst: Reg) -> Inst { pub(crate) fn xmm_cmp_rm_r(op: SseOpcode, src: RegMem, dst: Reg) -> Inst {
src.assert_regclass_is(RegClass::Float); src.assert_regclass_is(RegClass::Float);
debug_assert!(dst.class() == RegClass::Float); debug_assert!(dst.class() == RegClass::Float);
let src = XmmMem::new(src).unwrap(); let src = XmmMemAligned::new(src).unwrap();
let dst = Xmm::new(dst).unwrap(); let dst = Xmm::new(dst).unwrap();
Inst::XmmCmpRmR { op, src, dst } Inst::XmmCmpRmR { op, src, dst }
} }
@@ -894,6 +896,12 @@ impl PrettyPrint for Inst {
format!("{} {}, {}", ljustify(op.to_string()), src, dst) format!("{} {}, {}", ljustify(op.to_string()), src, dst)
} }
Inst::XmmUnaryRmRUnaligned { op, src, dst, .. } => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), op.src_size(), allocs);
let src = src.pretty_print(op.src_size(), allocs);
format!("{} {}, {}", ljustify(op.to_string()), src, dst)
}
Inst::XmmUnaryRmRImm { Inst::XmmUnaryRmRImm {
op, src, dst, imm, .. op, src, dst, imm, ..
} => { } => {
@@ -927,6 +935,19 @@ impl PrettyPrint for Inst {
format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst) format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst)
} }
Inst::XmmRmRUnaligned {
op,
src1,
src2,
dst,
..
} => {
let src1 = pretty_print_reg(src1.to_reg(), 8, allocs);
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
let src2 = src2.pretty_print(8, allocs);
format!("{} {}, {}, {}", ljustify(op.to_string()), src1, src2, dst)
}
Inst::XmmConstOp { op, dst } => { Inst::XmmConstOp { op, dst } => {
let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs); let dst = pretty_print_reg(dst.to_reg().to_reg(), 8, allocs);
format!("{} {dst}, {dst}, {dst}", ljustify(op.to_string())) format!("{} {dst}, {dst}, {dst}", ljustify(op.to_string()))
@@ -1862,9 +1883,11 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
collector.reg_def(dst.to_writable_reg()); collector.reg_def(dst.to_writable_reg());
src.get_operands(collector); src.get_operands(collector);
} }
Inst::XmmUnaryRmR { src, dst, .. } Inst::XmmUnaryRmR { src, dst, .. } | Inst::XmmUnaryRmRImm { src, dst, .. } => {
| Inst::XmmUnaryRmREvex { src, dst, .. } collector.reg_def(dst.to_writable_reg());
| Inst::XmmUnaryRmRImm { src, dst, .. } => { src.get_operands(collector);
}
Inst::XmmUnaryRmREvex { src, dst, .. } | Inst::XmmUnaryRmRUnaligned { src, dst, .. } => {
collector.reg_def(dst.to_writable_reg()); collector.reg_def(dst.to_writable_reg());
src.get_operands(collector); src.get_operands(collector);
} }
@@ -1875,6 +1898,13 @@ fn x64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut OperandCol
collector.reg_reuse_def(dst.to_writable_reg(), 0); collector.reg_reuse_def(dst.to_writable_reg(), 0);
src2.get_operands(collector); src2.get_operands(collector);
} }
Inst::XmmRmRUnaligned {
src1, src2, dst, ..
} => {
collector.reg_use(src1.to_reg());
collector.reg_reuse_def(dst.to_writable_reg(), 0);
src2.get_operands(collector);
}
Inst::XmmRmRBlend { Inst::XmmRmRBlend {
src1, src1,
src2, src2,

View File

@@ -2155,10 +2155,6 @@
;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; N.B.: there are no load-op merging rules here. We can't guarantee
;; the RHS (if a load) is 128-bit aligned, so we must avoid merging a
;; load. Likewise for other ops below.
(rule (lower (has_type $F32 (fadd x y))) (rule (lower (has_type $F32 (fadd x y)))
(x64_addss x y)) (x64_addss x y))
(rule (lower (has_type $F64 (fadd x y))) (rule (lower (has_type $F64 (fadd x y)))
@@ -2168,6 +2164,17 @@
(rule (lower (has_type $F64X2 (fadd x y))) (rule (lower (has_type $F64X2 (fadd x y)))
(x64_addpd x y)) (x64_addpd x y))
;; The above rules automatically sink loads for rhs operands, so additionally
;; add rules for sinking loads with lhs operands.
(rule 1 (lower (has_type $F32 (fadd (sinkable_load x) y)))
(x64_addss y (sink_load x)))
(rule 1 (lower (has_type $F64 (fadd (sinkable_load x) y)))
(x64_addsd y (sink_load x)))
(rule 1 (lower (has_type $F32X4 (fadd (sinkable_load x) y)))
(x64_addps y (sink_load x)))
(rule 1 (lower (has_type $F64X2 (fadd (sinkable_load x) y)))
(x64_addpd y (sink_load x)))
;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F32 (fsub x y))) (rule (lower (has_type $F32 (fsub x y)))
@@ -2190,6 +2197,17 @@
(rule (lower (has_type $F64X2 (fmul x y))) (rule (lower (has_type $F64X2 (fmul x y)))
(x64_mulpd x y)) (x64_mulpd x y))
;; The above rules automatically sink loads for rhs operands, so additionally
;; add rules for sinking loads with lhs operands.
(rule 1 (lower (has_type $F32 (fmul (sinkable_load x) y)))
(x64_mulss y (sink_load x)))
(rule 1 (lower (has_type $F64 (fmul (sinkable_load x) y)))
(x64_mulsd y (sink_load x)))
(rule 1 (lower (has_type $F32X4 (fmul (sinkable_load x) y)))
(x64_mulps y (sink_load x)))
(rule 1 (lower (has_type $F64X2 (fmul (sinkable_load x) y)))
(x64_mulpd y (sink_load x)))
;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (has_type $F32 (fdiv x y))) (rule (lower (has_type $F32 (fdiv x y)))
@@ -2983,7 +3001,7 @@
(tmp Xmm (x64_pxor tmp dst)) (tmp Xmm (x64_pxor tmp dst))
;; Convert the packed float to packed doubleword. ;; Convert the packed float to packed doubleword.
(dst Xmm (x64_cvttps2dq $F32X4 dst)) (dst Xmm (x64_cvttps2dq dst))
;; Set top bit only if < 0 ;; Set top bit only if < 0
(tmp Xmm (x64_pand dst tmp)) (tmp Xmm (x64_pand dst tmp))
@@ -3064,7 +3082,7 @@
;; Overflow lanes greater than the maximum allowed signed value will ;; Overflow lanes greater than the maximum allowed signed value will
;; set to 0x80000000. Negative and NaN lanes will be 0x0 ;; set to 0x80000000. Negative and NaN lanes will be 0x0
(tmp1 Xmm dst) (tmp1 Xmm dst)
(dst Xmm (x64_cvttps2dq $F32X4 dst)) (dst Xmm (x64_cvttps2dq dst))
;; Set lanes to src - max_signed_int ;; Set lanes to src - max_signed_int
(tmp1 Xmm (x64_subps tmp1 tmp2)) (tmp1 Xmm (x64_subps tmp1 tmp2))
@@ -3074,7 +3092,7 @@
(tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual))) (tmp2 Xmm (x64_cmpps tmp2 tmp1 (FcmpImm.LessThanOrEqual)))
;; Convert those set of lanes that have the max_signed_int factored out. ;; Convert those set of lanes that have the max_signed_int factored out.
(tmp1 Xmm (x64_cvttps2dq $F32X4 tmp1)) (tmp1 Xmm (x64_cvttps2dq tmp1))
;; Prepare converted lanes by zeroing negative lanes and prepping lanes ;; Prepare converted lanes by zeroing negative lanes and prepping lanes
;; that have positive overflow (based on the mask) by setting these lanes ;; that have positive overflow (based on the mask) by setting these lanes


@@ -92,12 +92,6 @@ fn is_mergeable_load(ctx: &mut Lower<Inst>, src_insn: IRInst) -> Option<(InsnInp
return None; return None;
} }
// SIMD instructions can only be load-coalesced when the loaded value comes
// from an aligned address.
if load_ty.is_vector() && !insn_data.memflags().map_or(false, |f| f.aligned()) {
return None;
}
// Just testing the opcode is enough, because the width will always match if // Just testing the opcode is enough, because the width will always match if
// the type does (and the type should match if the CLIF is properly // the type does (and the type should match if the CLIF is properly
// constructed). // constructed).


@@ -100,23 +100,9 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
if let Some(imm) = to_simm32(c as i64) { if let Some(imm) = to_simm32(c as i64) {
return imm.to_reg_mem_imm(); return imm.to_reg_mem_imm();
} }
// A load from the constant pool is better than a
// rematerialization into a register, because it reduces
// register pressure.
let vcode_constant = self.emit_u64_le_const(c);
return RegMemImm::mem(SyntheticAmode::ConstantOffset(vcode_constant));
} }
if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst { self.put_in_reg_mem(val).into()
if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) {
self.lower_ctx.sink_inst(src_insn);
let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
return RegMemImm::mem(amode);
}
}
RegMemImm::reg(self.put_in_reg(val))
} }
fn put_in_xmm_mem_imm(&mut self, val: Value) -> XmmMemImm { fn put_in_xmm_mem_imm(&mut self, val: Value) -> XmmMemImm {
@@ -150,7 +136,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
.unwrap(); .unwrap();
} }
XmmMem::new(RegMem::reg(self.put_in_reg(val))).unwrap() XmmMem::new(self.put_in_reg_mem(val)).unwrap()
} }
fn put_in_reg_mem(&mut self, val: Value) -> RegMem { fn put_in_reg_mem(&mut self, val: Value) -> RegMem {
@@ -164,12 +150,8 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
return RegMem::mem(SyntheticAmode::ConstantOffset(vcode_constant)); return RegMem::mem(SyntheticAmode::ConstantOffset(vcode_constant));
} }
if let InputSourceInst::UniqueUse(src_insn, 0) = inputs.inst { if let Some(load) = self.sinkable_load(val) {
if let Some((addr_input, offset)) = is_mergeable_load(self.lower_ctx, src_insn) { return self.sink_load(&load);
self.lower_ctx.sink_inst(src_insn);
let amode = lower_to_amode(self.lower_ctx, addr_input, offset);
return RegMem::mem(amode);
}
} }
RegMem::reg(self.put_in_reg(val)) RegMem::reg(self.put_in_reg(val))
@@ -446,7 +428,7 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
#[inline] #[inline]
fn xmm_mem_to_xmm_mem_imm(&mut self, r: &XmmMem) -> XmmMemImm { fn xmm_mem_to_xmm_mem_imm(&mut self, r: &XmmMem) -> XmmMemImm {
r.clone().into() XmmMemImm::new(r.clone().to_reg_mem().into()).unwrap()
} }
#[inline] #[inline]
@@ -997,10 +979,40 @@ impl Context for IsleContext<'_, '_, MInst, X64Backend> {
} }
} }
} }
fn xmm_mem_to_xmm_mem_aligned(&mut self, arg: &XmmMem) -> XmmMemAligned {
match XmmMemAligned::new(arg.clone().into()) {
Some(aligned) => aligned,
None => match arg.clone().into() {
RegMem::Mem { addr } => self.load_xmm_unaligned(addr).into(),
_ => unreachable!(),
},
}
}
fn xmm_mem_imm_to_xmm_mem_aligned_imm(&mut self, arg: &XmmMemImm) -> XmmMemAlignedImm {
match XmmMemAlignedImm::new(arg.clone().into()) {
Some(aligned) => aligned,
None => match arg.clone().into() {
RegMemImm::Mem { addr } => self.load_xmm_unaligned(addr).into(),
_ => unreachable!(),
},
}
}
} }
impl IsleContext<'_, '_, MInst, X64Backend> { impl IsleContext<'_, '_, MInst, X64Backend> {
isle_prelude_method_helpers!(X64Caller); isle_prelude_method_helpers!(X64Caller);
fn load_xmm_unaligned(&mut self, addr: SyntheticAmode) -> Xmm {
let tmp = self.lower_ctx.alloc_tmp(types::F32X4).only_reg().unwrap();
self.lower_ctx.emit(MInst::XmmUnaryRmRUnaligned {
op: SseOpcode::Movdqu,
src: XmmMem::new(RegMem::mem(addr)).unwrap(),
dst: Writable::from_reg(Xmm::new(tmp.to_reg()).unwrap()),
});
Xmm::new(tmp.to_reg()).unwrap()
}
} }
// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we // Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we


@@ -333,44 +333,42 @@ block0(v0: i64):
; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 } ; unwind PushFrameRegs { offset_upward_to_caller_sp: 16 }
; movq %rsp, %rbp ; movq %rsp, %rbp
; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 160 } ; unwind DefineNewFrame { offset_upward_to_caller_sp: 16, offset_downward_to_clobbers: 160 }
; subq %rsp, $256, %rsp ; subq %rsp, $224, %rsp
; movdqu %xmm6, 96(%rsp) ; movdqu %xmm6, 64(%rsp)
; unwind SaveReg { clobber_offset: 0, reg: p6f } ; unwind SaveReg { clobber_offset: 0, reg: p6f }
; movdqu %xmm7, 112(%rsp) ; movdqu %xmm7, 80(%rsp)
; unwind SaveReg { clobber_offset: 16, reg: p7f } ; unwind SaveReg { clobber_offset: 16, reg: p7f }
; movdqu %xmm8, 128(%rsp) ; movdqu %xmm8, 96(%rsp)
; unwind SaveReg { clobber_offset: 32, reg: p8f } ; unwind SaveReg { clobber_offset: 32, reg: p8f }
; movdqu %xmm9, 144(%rsp) ; movdqu %xmm9, 112(%rsp)
; unwind SaveReg { clobber_offset: 48, reg: p9f } ; unwind SaveReg { clobber_offset: 48, reg: p9f }
; movdqu %xmm10, 160(%rsp) ; movdqu %xmm10, 128(%rsp)
; unwind SaveReg { clobber_offset: 64, reg: p10f } ; unwind SaveReg { clobber_offset: 64, reg: p10f }
; movdqu %xmm11, 176(%rsp) ; movdqu %xmm11, 144(%rsp)
; unwind SaveReg { clobber_offset: 80, reg: p11f } ; unwind SaveReg { clobber_offset: 80, reg: p11f }
; movdqu %xmm12, 192(%rsp) ; movdqu %xmm12, 160(%rsp)
; unwind SaveReg { clobber_offset: 96, reg: p12f } ; unwind SaveReg { clobber_offset: 96, reg: p12f }
; movdqu %xmm13, 208(%rsp) ; movdqu %xmm13, 176(%rsp)
; unwind SaveReg { clobber_offset: 112, reg: p13f } ; unwind SaveReg { clobber_offset: 112, reg: p13f }
; movdqu %xmm14, 224(%rsp) ; movdqu %xmm14, 192(%rsp)
; unwind SaveReg { clobber_offset: 128, reg: p14f } ; unwind SaveReg { clobber_offset: 128, reg: p14f }
; movdqu %xmm15, 240(%rsp) ; movdqu %xmm15, 208(%rsp)
; unwind SaveReg { clobber_offset: 144, reg: p15f } ; unwind SaveReg { clobber_offset: 144, reg: p15f }
; block0: ; block0:
; movsd 0(%rcx), %xmm0 ; movsd 0(%rcx), %xmm0
; movsd 8(%rcx), %xmm10 ; movsd 8(%rcx), %xmm10
; movdqu %xmm10, rsp(80 + virtual offset) ; movdqu %xmm10, rsp(48 + virtual offset)
; movsd 16(%rcx), %xmm2 ; movsd 16(%rcx), %xmm5
; movdqu %xmm2, rsp(0 + virtual offset)
; movsd 24(%rcx), %xmm14 ; movsd 24(%rcx), %xmm14
; movdqu %xmm14, rsp(64 + virtual offset) ; movdqu %xmm14, rsp(32 + virtual offset)
; movsd 32(%rcx), %xmm13 ; movsd 32(%rcx), %xmm13
; movsd 40(%rcx), %xmm15 ; movsd 40(%rcx), %xmm15
; movdqu %xmm15, rsp(48 + virtual offset) ; movdqu %xmm15, rsp(16 + virtual offset)
; movsd 48(%rcx), %xmm7 ; movsd 48(%rcx), %xmm7
; movsd 56(%rcx), %xmm5 ; movsd 56(%rcx), %xmm8
; movdqu %xmm5, rsp(32 + virtual offset) ; movdqu %xmm8, rsp(0 + virtual offset)
; movsd 64(%rcx), %xmm12 ; movsd 64(%rcx), %xmm12
; movsd 72(%rcx), %xmm4 ; movsd 72(%rcx), %xmm2
; movdqu %xmm4, rsp(16 + virtual offset)
; movsd 80(%rcx), %xmm9 ; movsd 80(%rcx), %xmm9
; movsd 88(%rcx), %xmm4 ; movsd 88(%rcx), %xmm4
; movsd 96(%rcx), %xmm3 ; movsd 96(%rcx), %xmm3
@@ -380,24 +378,21 @@ block0(v0: i64):
; movsd 128(%rcx), %xmm6 ; movsd 128(%rcx), %xmm6
; movsd 136(%rcx), %xmm14 ; movsd 136(%rcx), %xmm14
; movsd 144(%rcx), %xmm1 ; movsd 144(%rcx), %xmm1
; movsd 152(%rcx), %xmm15 ; movdqu rsp(48 + virtual offset), %xmm15
; movdqu rsp(80 + virtual offset), %xmm2 ; addsd %xmm0, %xmm15, %xmm0
; addsd %xmm0, %xmm2, %xmm0 ; movdqu rsp(32 + virtual offset), %xmm15
; movdqu rsp(0 + virtual offset), %xmm2 ; addsd %xmm5, %xmm15, %xmm5
; movdqu rsp(64 + virtual offset), %xmm5 ; movdqu rsp(16 + virtual offset), %xmm15
; addsd %xmm2, %xmm5, %xmm2 ; addsd %xmm13, %xmm15, %xmm13
; movdqu rsp(48 + virtual offset), %xmm5 ; movdqu rsp(0 + virtual offset), %xmm15
; addsd %xmm13, %xmm5, %xmm13 ; addsd %xmm7, %xmm15, %xmm7
; movdqu rsp(32 + virtual offset), %xmm5 ; addsd %xmm12, %xmm2, %xmm12
; addsd %xmm7, %xmm5, %xmm7
; movdqu rsp(16 + virtual offset), %xmm5
; addsd %xmm12, %xmm5, %xmm12
; addsd %xmm9, %xmm4, %xmm9 ; addsd %xmm9, %xmm4, %xmm9
; addsd %xmm3, %xmm8, %xmm3 ; addsd %xmm3, %xmm8, %xmm3
; addsd %xmm11, %xmm10, %xmm11 ; addsd %xmm11, %xmm10, %xmm11
; addsd %xmm6, %xmm14, %xmm6 ; addsd %xmm6, %xmm14, %xmm6
; addsd %xmm1, %xmm15, %xmm1 ; addsd %xmm1, 152(%rcx), %xmm1
; addsd %xmm0, %xmm2, %xmm0 ; addsd %xmm0, %xmm5, %xmm0
; addsd %xmm13, %xmm7, %xmm13 ; addsd %xmm13, %xmm7, %xmm13
; addsd %xmm12, %xmm9, %xmm12 ; addsd %xmm12, %xmm9, %xmm12
; addsd %xmm3, %xmm11, %xmm3 ; addsd %xmm3, %xmm11, %xmm3
@@ -406,17 +401,17 @@ block0(v0: i64):
; addsd %xmm12, %xmm3, %xmm12 ; addsd %xmm12, %xmm3, %xmm12
; addsd %xmm0, %xmm12, %xmm0 ; addsd %xmm0, %xmm12, %xmm0
; addsd %xmm0, %xmm6, %xmm0 ; addsd %xmm0, %xmm6, %xmm0
; movdqu 96(%rsp), %xmm6 ; movdqu 64(%rsp), %xmm6
; movdqu 112(%rsp), %xmm7 ; movdqu 80(%rsp), %xmm7
; movdqu 128(%rsp), %xmm8 ; movdqu 96(%rsp), %xmm8
; movdqu 144(%rsp), %xmm9 ; movdqu 112(%rsp), %xmm9
; movdqu 160(%rsp), %xmm10 ; movdqu 128(%rsp), %xmm10
; movdqu 176(%rsp), %xmm11 ; movdqu 144(%rsp), %xmm11
; movdqu 192(%rsp), %xmm12 ; movdqu 160(%rsp), %xmm12
; movdqu 208(%rsp), %xmm13 ; movdqu 176(%rsp), %xmm13
; movdqu 224(%rsp), %xmm14 ; movdqu 192(%rsp), %xmm14
; movdqu 240(%rsp), %xmm15 ; movdqu 208(%rsp), %xmm15
; addq %rsp, $256, %rsp ; addq %rsp, $224, %rsp
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; ret ; ret
@@ -425,34 +420,32 @@ block0(v0: i64):
; block0: ; offset 0x0 ; block0: ; offset 0x0
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; subq $0x100, %rsp ; subq $0xe0, %rsp
; movdqu %xmm6, 0x60(%rsp) ; movdqu %xmm6, 0x40(%rsp)
; movdqu %xmm7, 0x70(%rsp) ; movdqu %xmm7, 0x50(%rsp)
; movdqu %xmm8, 0x80(%rsp) ; movdqu %xmm8, 0x60(%rsp)
; movdqu %xmm9, 0x90(%rsp) ; movdqu %xmm9, 0x70(%rsp)
; movdqu %xmm10, 0xa0(%rsp) ; movdqu %xmm10, 0x80(%rsp)
; movdqu %xmm11, 0xb0(%rsp) ; movdqu %xmm11, 0x90(%rsp)
; movdqu %xmm12, 0xc0(%rsp) ; movdqu %xmm12, 0xa0(%rsp)
; movdqu %xmm13, 0xd0(%rsp) ; movdqu %xmm13, 0xb0(%rsp)
; movdqu %xmm14, 0xe0(%rsp) ; movdqu %xmm14, 0xc0(%rsp)
; movdqu %xmm15, 0xf0(%rsp) ; movdqu %xmm15, 0xd0(%rsp)
; block1: ; offset 0x67 ; block1: ; offset 0x61
; movsd (%rcx), %xmm0 ; trap: heap_oob ; movsd (%rcx), %xmm0 ; trap: heap_oob
; movsd 8(%rcx), %xmm10 ; trap: heap_oob ; movsd 8(%rcx), %xmm10 ; trap: heap_oob
; movdqu %xmm10, 0x50(%rsp) ; movdqu %xmm10, 0x30(%rsp)
; movsd 0x10(%rcx), %xmm2 ; trap: heap_oob ; movsd 0x10(%rcx), %xmm5 ; trap: heap_oob
; movdqu %xmm2, (%rsp)
; movsd 0x18(%rcx), %xmm14 ; trap: heap_oob ; movsd 0x18(%rcx), %xmm14 ; trap: heap_oob
; movdqu %xmm14, 0x40(%rsp) ; movdqu %xmm14, 0x20(%rsp)
; movsd 0x20(%rcx), %xmm13 ; trap: heap_oob ; movsd 0x20(%rcx), %xmm13 ; trap: heap_oob
; movsd 0x28(%rcx), %xmm15 ; trap: heap_oob ; movsd 0x28(%rcx), %xmm15 ; trap: heap_oob
; movdqu %xmm15, 0x30(%rsp) ; movdqu %xmm15, 0x10(%rsp)
; movsd 0x30(%rcx), %xmm7 ; trap: heap_oob ; movsd 0x30(%rcx), %xmm7 ; trap: heap_oob
; movsd 0x38(%rcx), %xmm5 ; trap: heap_oob ; movsd 0x38(%rcx), %xmm8 ; trap: heap_oob
; movdqu %xmm5, 0x20(%rsp) ; movdqu %xmm8, (%rsp)
; movsd 0x40(%rcx), %xmm12 ; trap: heap_oob ; movsd 0x40(%rcx), %xmm12 ; trap: heap_oob
; movsd 0x48(%rcx), %xmm4 ; trap: heap_oob ; movsd 0x48(%rcx), %xmm2 ; trap: heap_oob
; movdqu %xmm4, 0x10(%rsp)
; movsd 0x50(%rcx), %xmm9 ; trap: heap_oob ; movsd 0x50(%rcx), %xmm9 ; trap: heap_oob
; movsd 0x58(%rcx), %xmm4 ; trap: heap_oob ; movsd 0x58(%rcx), %xmm4 ; trap: heap_oob
; movsd 0x60(%rcx), %xmm3 ; trap: heap_oob ; movsd 0x60(%rcx), %xmm3 ; trap: heap_oob
@@ -462,24 +455,21 @@ block0(v0: i64):
; movsd 0x80(%rcx), %xmm6 ; trap: heap_oob ; movsd 0x80(%rcx), %xmm6 ; trap: heap_oob
; movsd 0x88(%rcx), %xmm14 ; trap: heap_oob ; movsd 0x88(%rcx), %xmm14 ; trap: heap_oob
; movsd 0x90(%rcx), %xmm1 ; trap: heap_oob ; movsd 0x90(%rcx), %xmm1 ; trap: heap_oob
; movsd 0x98(%rcx), %xmm15 ; trap: heap_oob ; movdqu 0x30(%rsp), %xmm15
; movdqu 0x50(%rsp), %xmm2 ; addsd %xmm15, %xmm0
; addsd %xmm2, %xmm0 ; movdqu 0x20(%rsp), %xmm15
; movdqu (%rsp), %xmm2 ; addsd %xmm15, %xmm5
; movdqu 0x40(%rsp), %xmm5 ; movdqu 0x10(%rsp), %xmm15
; addsd %xmm5, %xmm2 ; addsd %xmm15, %xmm13
; movdqu 0x30(%rsp), %xmm5 ; movdqu (%rsp), %xmm15
; addsd %xmm5, %xmm13 ; addsd %xmm15, %xmm7
; movdqu 0x20(%rsp), %xmm5 ; addsd %xmm2, %xmm12
; addsd %xmm5, %xmm7
; movdqu 0x10(%rsp), %xmm5
; addsd %xmm5, %xmm12
; addsd %xmm4, %xmm9 ; addsd %xmm4, %xmm9
; addsd %xmm8, %xmm3 ; addsd %xmm8, %xmm3
; addsd %xmm10, %xmm11 ; addsd %xmm10, %xmm11
; addsd %xmm14, %xmm6 ; addsd %xmm14, %xmm6
; addsd %xmm15, %xmm1 ; addsd 0x98(%rcx), %xmm1 ; trap: heap_oob
; addsd %xmm2, %xmm0 ; addsd %xmm5, %xmm0
; addsd %xmm7, %xmm13 ; addsd %xmm7, %xmm13
; addsd %xmm9, %xmm12 ; addsd %xmm9, %xmm12
; addsd %xmm11, %xmm3 ; addsd %xmm11, %xmm3
@@ -488,17 +478,17 @@ block0(v0: i64):
; addsd %xmm3, %xmm12 ; addsd %xmm3, %xmm12
; addsd %xmm12, %xmm0 ; addsd %xmm12, %xmm0
; addsd %xmm6, %xmm0 ; addsd %xmm6, %xmm0
; movdqu 0x60(%rsp), %xmm6 ; movdqu 0x40(%rsp), %xmm6
; movdqu 0x70(%rsp), %xmm7 ; movdqu 0x50(%rsp), %xmm7
; movdqu 0x80(%rsp), %xmm8 ; movdqu 0x60(%rsp), %xmm8
; movdqu 0x90(%rsp), %xmm9 ; movdqu 0x70(%rsp), %xmm9
; movdqu 0xa0(%rsp), %xmm10 ; movdqu 0x80(%rsp), %xmm10
; movdqu 0xb0(%rsp), %xmm11 ; movdqu 0x90(%rsp), %xmm11
; movdqu 0xc0(%rsp), %xmm12 ; movdqu 0xa0(%rsp), %xmm12
; movdqu 0xd0(%rsp), %xmm13 ; movdqu 0xb0(%rsp), %xmm13
; movdqu 0xe0(%rsp), %xmm14 ; movdqu 0xc0(%rsp), %xmm14
; movdqu 0xf0(%rsp), %xmm15 ; movdqu 0xd0(%rsp), %xmm15
; addq $0x100, %rsp ; addq $0xe0, %rsp
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq


@@ -13,8 +13,7 @@ block0(v0: f32x4, v1: i64):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block0: ; block0:
; movups 0(%rdi), %xmm4 ; vorps %xmm0, 0(%rdi), %xmm0
; vorps %xmm0, %xmm4, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; ret ; ret
@@ -24,8 +23,7 @@ block0(v0: f32x4, v1: i64):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block1: ; offset 0x4 ; block1: ; offset 0x4
; movups (%rdi), %xmm4 ; vorps (%rdi), %xmm0, %xmm0
; vorps %xmm4, %xmm0, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq
@@ -42,12 +40,11 @@ block0(v0: i64):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block0: ; block0:
; movss 0(%rdi), %xmm7 ; movl $-2147483648, %eax
; movl $-2147483648, %ecx ; movd %eax, %xmm4
; movd %ecx, %xmm5 ; vandnps %xmm4, const(0), %xmm6
; vandnps %xmm5, const(0), %xmm8 ; vandps %xmm4, 0(%rdi), %xmm8
; vandps %xmm5, %xmm7, %xmm9 ; vorps %xmm6, %xmm8, %xmm0
; vorps %xmm8, %xmm9, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; ret ; ret
@@ -57,12 +54,11 @@ block0(v0: i64):
; pushq %rbp ; pushq %rbp
; movq %rsp, %rbp ; movq %rsp, %rbp
; block1: ; offset 0x4 ; block1: ; offset 0x4
; movss (%rdi), %xmm7 ; movl $0x80000000, %eax
; movl $0x80000000, %ecx ; movd %eax, %xmm4
; movd %ecx, %xmm5 ; vandnps 0x1b(%rip), %xmm4, %xmm6
; vandnps 0x17(%rip), %xmm5, %xmm8 ; vandps (%rdi), %xmm4, %xmm8
; vandps %xmm7, %xmm5, %xmm9 ; vorps %xmm8, %xmm6, %xmm0
; vorps %xmm9, %xmm8, %xmm0
; movq %rbp, %rsp ; movq %rbp, %rsp
; popq %rbp ; popq %rbp
; retq ; retq
@@ -78,6 +74,8 @@ block0(v0: i64):
; addb %al, (%rax) ; addb %al, (%rax)
; addb %al, (%rax) ; addb %al, (%rax)
; addb %al, (%rax) ; addb %al, (%rax)
; addb %al, (%rax)
; addb %al, (%rax)
function %bor_f32x4(f32x4, f32x4) -> f32x4 { function %bor_f32x4(f32x4, f32x4) -> f32x4 {
block0(v0: f32x4, v1: f32x4): block0(v0: f32x4, v1: f32x4):


@@ -0,0 +1,154 @@
test compile precise-output
set enable_simd
target x86_64
function %uload8x8(i64) -> i16x8 {
block0(v0: i64):
v1 = uload8x8 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxbw 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovzxbw (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %sload8x8(i64) -> i16x8 {
block0(v0: i64):
v1 = sload8x8 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxbw 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovsxbw (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %uload16x4(i64) -> i32x4 {
block0(v0: i64):
v1 = uload16x4 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxwd 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovzxwd (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %sload16x4(i64) -> i32x4 {
block0(v0: i64):
v1 = sload16x4 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxwd 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovsxwd (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %uload32x2(i64) -> i64x2 {
block0(v0: i64):
v1 = uload32x2 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovzxdq 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovzxdq (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq
function %sload32x2(i64) -> i64x2 {
block0(v0: i64):
v1 = sload32x2 v0
return v1
}
; VCode:
; pushq %rbp
; movq %rsp, %rbp
; block0:
; pmovsxdq 0(%rdi), %xmm0
; movq %rbp, %rsp
; popq %rbp
; ret
;
; Disassembled:
; block0: ; offset 0x0
; pushq %rbp
; movq %rsp, %rbp
; block1: ; offset 0x4
; pmovsxdq (%rdi), %xmm0 ; trap: heap_oob
; movq %rbp, %rsp
; popq %rbp
; retq