x64: port load to ISLE (#3993)

This change moves the majority of the lowerings for CLIF's `load`
instruction over to ISLE. To do so, it also migrates the previous
mechanism for creating an `Amode` (`lower_to_amode`) to several ISLE
rules (see `to_amode`).
Andrew Brown
2022-04-07 18:31:22 -07:00
committed by GitHub
parent 76f7cde673
commit f62199da8c
12 changed files with 1726 additions and 806 deletions


@@ -1,4 +1,4 @@
src/clif.isle 443b34b797fc8ace
src/prelude.isle 74d9514ac948e163
src/prelude.isle c0751050a11e2686
src/isa/aarch64/inst.isle 19ccefb6a496d392
src/isa/aarch64/lower.isle d88b62dd6b40622


@@ -55,6 +55,7 @@ pub trait Context {
fn ty_32_or_64(&mut self, arg0: Type) -> Option<Type>;
fn ty_8_or_16(&mut self, arg0: Type) -> Option<Type>;
fn ty_int_bool_64(&mut self, arg0: Type) -> Option<Type>;
fn ty_int_bool_ref_64(&mut self, arg0: Type) -> Option<Type>;
fn ty_int_bool_128(&mut self, arg0: Type) -> Option<Type>;
fn ty_scalar_float(&mut self, arg0: Type) -> Option<Type>;
fn ty_vec128(&mut self, arg0: Type) -> Option<Type>;
@@ -76,6 +77,7 @@ pub trait Context {
fn value_type(&mut self, arg0: Value) -> Type;
fn multi_lane(&mut self, arg0: Type) -> Option<(u8, u16)>;
fn def_inst(&mut self, arg0: Value) -> Option<Inst>;
fn offset32_to_u32(&mut self, arg0: Offset32) -> u32;
fn emit(&mut self, arg0: &MInst) -> Unit;
fn emit_safepoint(&mut self, arg0: &MInst) -> Unit;
fn trap_code_division_by_zero(&mut self) -> TrapCode;
@@ -130,13 +132,13 @@ pub trait Context {
fn rotr_opposite_amount(&mut self, arg0: Type, arg1: ImmShift) -> ImmShift;
}
/// Internal type SideEffectNoResult: defined at src/prelude.isle line 397.
/// Internal type SideEffectNoResult: defined at src/prelude.isle line 405.
#[derive(Clone, Debug)]
pub enum SideEffectNoResult {
Inst { inst: MInst },
}
/// Internal type ProducesFlags: defined at src/prelude.isle line 419.
/// Internal type ProducesFlags: defined at src/prelude.isle line 427.
#[derive(Clone, Debug)]
pub enum ProducesFlags {
ProducesFlagsSideEffect { inst: MInst },
@@ -144,7 +146,7 @@ pub enum ProducesFlags {
ProducesFlagsReturnsResultWithConsumer { inst: MInst, result: Reg },
}
/// Internal type ConsumesFlags: defined at src/prelude.isle line 430.
/// Internal type ConsumesFlags: defined at src/prelude.isle line 438.
#[derive(Clone, Debug)]
pub enum ConsumesFlags {
ConsumesFlagsReturnsResultWithProducer {
@@ -1086,7 +1088,7 @@ pub fn constructor_side_effect<C: Context>(
inst: ref pattern1_0,
} = pattern0_0
{
// Rule at src/prelude.isle line 402.
// Rule at src/prelude.isle line 410.
let expr0_0 = C::emit(ctx, pattern1_0);
let expr1_0 = C::output_none(ctx);
return Some(expr1_0);
@@ -1104,7 +1106,7 @@ pub fn constructor_safepoint<C: Context>(
inst: ref pattern1_0,
} = pattern0_0
{
// Rule at src/prelude.isle line 408.
// Rule at src/prelude.isle line 416.
let expr0_0 = C::emit_safepoint(ctx, pattern1_0);
let expr1_0 = C::output_none(ctx);
return Some(expr1_0);
@@ -1123,7 +1125,7 @@ pub fn constructor_produces_flags_get_reg<C: Context>(
result: pattern1_1,
} = pattern0_0
{
// Rule at src/prelude.isle line 446.
// Rule at src/prelude.isle line 454.
return Some(pattern1_1);
}
return None;
@@ -1140,7 +1142,7 @@ pub fn constructor_produces_flags_ignore<C: Context>(
inst: ref pattern1_0,
result: pattern1_1,
} => {
// Rule at src/prelude.isle line 451.
// Rule at src/prelude.isle line 459.
let expr0_0 = ProducesFlags::ProducesFlagsSideEffect {
inst: pattern1_0.clone(),
};
@@ -1150,7 +1152,7 @@ pub fn constructor_produces_flags_ignore<C: Context>(
inst: ref pattern1_0,
result: pattern1_1,
} => {
// Rule at src/prelude.isle line 453.
// Rule at src/prelude.isle line 461.
let expr0_0 = ProducesFlags::ProducesFlagsSideEffect {
inst: pattern1_0.clone(),
};
@@ -1179,7 +1181,7 @@ pub fn constructor_consumes_flags_concat<C: Context>(
result: pattern3_1,
} = pattern2_0
{
// Rule at src/prelude.isle line 460.
// Rule at src/prelude.isle line 468.
let expr0_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
let expr1_0 = ConsumesFlags::ConsumesFlagsTwiceReturnsValueRegs {
inst1: pattern1_0.clone(),
@@ -1209,7 +1211,7 @@ pub fn constructor_with_flags<C: Context>(
inst: ref pattern3_0,
result: pattern3_1,
} => {
// Rule at src/prelude.isle line 485.
// Rule at src/prelude.isle line 493.
let expr0_0 = C::emit(ctx, pattern1_0);
let expr1_0 = C::emit(ctx, pattern3_0);
let expr2_0 = C::value_reg(ctx, pattern3_1);
@@ -1220,7 +1222,7 @@ pub fn constructor_with_flags<C: Context>(
inst2: ref pattern3_1,
result: pattern3_2,
} => {
// Rule at src/prelude.isle line 491.
// Rule at src/prelude.isle line 499.
let expr0_0 = C::emit(ctx, pattern1_0);
let expr1_0 = C::emit(ctx, pattern3_0);
let expr2_0 = C::emit(ctx, pattern3_1);
@@ -1233,7 +1235,7 @@ pub fn constructor_with_flags<C: Context>(
inst4: ref pattern3_3,
result: pattern3_4,
} => {
// Rule at src/prelude.isle line 503.
// Rule at src/prelude.isle line 511.
let expr0_0 = C::emit(ctx, pattern1_0);
let expr1_0 = C::emit(ctx, pattern3_0);
let expr2_0 = C::emit(ctx, pattern3_1);
@@ -1254,7 +1256,7 @@ pub fn constructor_with_flags<C: Context>(
result: pattern3_1,
} = pattern2_0
{
// Rule at src/prelude.isle line 479.
// Rule at src/prelude.isle line 487.
let expr0_0 = C::emit(ctx, pattern1_0);
let expr1_0 = C::emit(ctx, pattern3_0);
let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
@@ -1274,7 +1276,7 @@ pub fn constructor_with_flags_reg<C: Context>(
) -> Option<Reg> {
let pattern0_0 = arg0;
let pattern1_0 = arg1;
// Rule at src/prelude.isle line 520.
// Rule at src/prelude.isle line 528.
let expr0_0 = constructor_with_flags(ctx, pattern0_0, pattern1_0)?;
let expr1_0: usize = 0;
let expr2_0 = C::value_regs_get(ctx, expr0_0, expr1_0);


@@ -1,4 +1,4 @@
src/clif.isle 443b34b797fc8ace
src/prelude.isle 74d9514ac948e163
src/prelude.isle c0751050a11e2686
src/isa/s390x/inst.isle d91a16074ab186a8
src/isa/s390x/lower.isle 1cc5a12adc8c75f9


@@ -55,6 +55,7 @@ pub trait Context {
fn ty_32_or_64(&mut self, arg0: Type) -> Option<Type>;
fn ty_8_or_16(&mut self, arg0: Type) -> Option<Type>;
fn ty_int_bool_64(&mut self, arg0: Type) -> Option<Type>;
fn ty_int_bool_ref_64(&mut self, arg0: Type) -> Option<Type>;
fn ty_int_bool_128(&mut self, arg0: Type) -> Option<Type>;
fn ty_scalar_float(&mut self, arg0: Type) -> Option<Type>;
fn ty_vec128(&mut self, arg0: Type) -> Option<Type>;
@@ -76,6 +77,7 @@ pub trait Context {
fn value_type(&mut self, arg0: Value) -> Type;
fn multi_lane(&mut self, arg0: Type) -> Option<(u8, u16)>;
fn def_inst(&mut self, arg0: Value) -> Option<Inst>;
fn offset32_to_u32(&mut self, arg0: Offset32) -> u32;
fn emit(&mut self, arg0: &MInst) -> Unit;
fn emit_safepoint(&mut self, arg0: &MInst) -> Unit;
fn trap_code_division_by_zero(&mut self) -> TrapCode;
@@ -153,13 +155,13 @@ pub trait Context {
fn same_reg(&mut self, arg0: Reg, arg1: WritableReg) -> Option<()>;
}
/// Internal type SideEffectNoResult: defined at src/prelude.isle line 397.
/// Internal type SideEffectNoResult: defined at src/prelude.isle line 405.
#[derive(Clone, Debug)]
pub enum SideEffectNoResult {
Inst { inst: MInst },
}
/// Internal type ProducesFlags: defined at src/prelude.isle line 419.
/// Internal type ProducesFlags: defined at src/prelude.isle line 427.
#[derive(Clone, Debug)]
pub enum ProducesFlags {
ProducesFlagsSideEffect { inst: MInst },
@@ -167,7 +169,7 @@ pub enum ProducesFlags {
ProducesFlagsReturnsResultWithConsumer { inst: MInst, result: Reg },
}
/// Internal type ConsumesFlags: defined at src/prelude.isle line 430.
/// Internal type ConsumesFlags: defined at src/prelude.isle line 438.
#[derive(Clone, Debug)]
pub enum ConsumesFlags {
ConsumesFlagsReturnsResultWithProducer {
@@ -957,7 +959,7 @@ pub fn constructor_side_effect<C: Context>(
inst: ref pattern1_0,
} = pattern0_0
{
// Rule at src/prelude.isle line 402.
// Rule at src/prelude.isle line 410.
let expr0_0 = C::emit(ctx, pattern1_0);
let expr1_0 = C::output_none(ctx);
return Some(expr1_0);
@@ -975,7 +977,7 @@ pub fn constructor_safepoint<C: Context>(
inst: ref pattern1_0,
} = pattern0_0
{
// Rule at src/prelude.isle line 408.
// Rule at src/prelude.isle line 416.
let expr0_0 = C::emit_safepoint(ctx, pattern1_0);
let expr1_0 = C::output_none(ctx);
return Some(expr1_0);
@@ -994,7 +996,7 @@ pub fn constructor_produces_flags_get_reg<C: Context>(
result: pattern1_1,
} = pattern0_0
{
// Rule at src/prelude.isle line 446.
// Rule at src/prelude.isle line 454.
return Some(pattern1_1);
}
return None;
@@ -1011,7 +1013,7 @@ pub fn constructor_produces_flags_ignore<C: Context>(
inst: ref pattern1_0,
result: pattern1_1,
} => {
// Rule at src/prelude.isle line 451.
// Rule at src/prelude.isle line 459.
let expr0_0 = ProducesFlags::ProducesFlagsSideEffect {
inst: pattern1_0.clone(),
};
@@ -1021,7 +1023,7 @@ pub fn constructor_produces_flags_ignore<C: Context>(
inst: ref pattern1_0,
result: pattern1_1,
} => {
// Rule at src/prelude.isle line 453.
// Rule at src/prelude.isle line 461.
let expr0_0 = ProducesFlags::ProducesFlagsSideEffect {
inst: pattern1_0.clone(),
};
@@ -1050,7 +1052,7 @@ pub fn constructor_consumes_flags_concat<C: Context>(
result: pattern3_1,
} = pattern2_0
{
// Rule at src/prelude.isle line 460.
// Rule at src/prelude.isle line 468.
let expr0_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
let expr1_0 = ConsumesFlags::ConsumesFlagsTwiceReturnsValueRegs {
inst1: pattern1_0.clone(),
@@ -1080,7 +1082,7 @@ pub fn constructor_with_flags<C: Context>(
inst: ref pattern3_0,
result: pattern3_1,
} => {
// Rule at src/prelude.isle line 485.
// Rule at src/prelude.isle line 493.
let expr0_0 = C::emit(ctx, pattern1_0);
let expr1_0 = C::emit(ctx, pattern3_0);
let expr2_0 = C::value_reg(ctx, pattern3_1);
@@ -1091,7 +1093,7 @@ pub fn constructor_with_flags<C: Context>(
inst2: ref pattern3_1,
result: pattern3_2,
} => {
// Rule at src/prelude.isle line 491.
// Rule at src/prelude.isle line 499.
let expr0_0 = C::emit(ctx, pattern1_0);
let expr1_0 = C::emit(ctx, pattern3_0);
let expr2_0 = C::emit(ctx, pattern3_1);
@@ -1104,7 +1106,7 @@ pub fn constructor_with_flags<C: Context>(
inst4: ref pattern3_3,
result: pattern3_4,
} => {
// Rule at src/prelude.isle line 503.
// Rule at src/prelude.isle line 511.
let expr0_0 = C::emit(ctx, pattern1_0);
let expr1_0 = C::emit(ctx, pattern3_0);
let expr2_0 = C::emit(ctx, pattern3_1);
@@ -1125,7 +1127,7 @@ pub fn constructor_with_flags<C: Context>(
result: pattern3_1,
} = pattern2_0
{
// Rule at src/prelude.isle line 479.
// Rule at src/prelude.isle line 487.
let expr0_0 = C::emit(ctx, pattern1_0);
let expr1_0 = C::emit(ctx, pattern3_0);
let expr2_0 = C::value_regs(ctx, pattern1_1, pattern3_1);
@@ -1145,7 +1147,7 @@ pub fn constructor_with_flags_reg<C: Context>(
) -> Option<Reg> {
let pattern0_0 = arg0;
let pattern1_0 = arg1;
// Rule at src/prelude.isle line 520.
// Rule at src/prelude.isle line 528.
let expr0_0 = constructor_with_flags(ctx, pattern0_0, pattern1_0)?;
let expr1_0: usize = 0;
let expr2_0 = C::value_regs_get(ctx, expr0_0, expr1_0);


@@ -748,18 +748,96 @@
(decl put_in_reg_mem (Value) RegMem)
(extern constructor put_in_reg_mem put_in_reg_mem)
;; Addressing modes.
(type SyntheticAmode extern (enum))
(decl synthetic_amode_to_reg_mem (SyntheticAmode) RegMem)
(extern constructor synthetic_amode_to_reg_mem synthetic_amode_to_reg_mem)
(decl amode_to_synthetic_amode (Amode) SyntheticAmode)
(extern constructor amode_to_synthetic_amode amode_to_synthetic_amode)
(type Amode extern (enum))
(decl amode_with_flags (Amode MemFlags) Amode)
(extern constructor amode_with_flags amode_with_flags)
(decl amode_imm_reg (u32 Gpr) Amode)
(extern constructor amode_imm_reg amode_imm_reg)
(decl amode_imm_reg_flags (u32 Gpr MemFlags) Amode)
(rule (amode_imm_reg_flags offset base flags)
(amode_with_flags (amode_imm_reg offset base) flags))
(decl amode_imm_reg_reg_shift (u32 Gpr Gpr u8) Amode)
(extern constructor amode_imm_reg_reg_shift amode_imm_reg_reg_shift)
(decl amode_to_synthetic_amode (Amode) SyntheticAmode)
(extern constructor amode_to_synthetic_amode amode_to_synthetic_amode)
(decl amode_imm_reg_reg_shift_flags (u32 Gpr Gpr u8 MemFlags) Amode)
(rule (amode_imm_reg_reg_shift_flags offset base index shift flags)
(amode_with_flags (amode_imm_reg_reg_shift offset base index shift) flags))
;; A helper to check if a shift amount (the `Value`) is both constant and
;; less than or equal to 3; this is needed since x64 addressing modes can only
;; encode shift amounts of 0-3 (two bits in the SIB byte).
(decl const_shift_lt_eq_3 (u8) Value)
(extern extractor const_shift_lt_eq_3 const_shift_lt_eq_3)
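The limit of 3 exists because the SIB byte's scale field is only two bits wide, so an index register can be scaled by at most 1, 2, 4, or 8. A tiny illustration with a hypothetical helper, not part of the backend:

    // The SIB byte's 2-bit scale field encodes a multiplier of 1, 2, 4, or 8,
    // i.e. a left shift of at most 3.
    fn sib_scale(shift: u8) -> Option<u8> {
        if shift <= 3 {
            Some(1u8 << shift) // e.g. sib_scale(3) == Some(8)
        } else {
            None
        }
    }
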
;; A helper to both check that the `Imm64` and `Offset32` values sum to a value
;; that fits in 32 bits AND return this sum as a `u32`. Also, the `Imm64` will be
;; zero-extended from `Type` up to 64 bits. This is useful for `to_amode`.
(decl sum_extend_fits_in_32_bits (Type Imm64 u32) Offset32)
(extern extractor sum_extend_fits_in_32_bits sum_extend_fits_in_32_bits (in in out))
;; To generate an address for a memory access, we can pattern-match various CLIF
;; sub-trees to x64's complex addressing modes (`Amode`). In pseudo-code:
;;
;; if address matches iadd(a, b):
;;   if either a or b:
;;     matches (ishl c with shift amount <= 3):
;;       amode(base + offset + (c << amount))
;;     matches (iconst c where c + offset will fit in 32 bits):
;;       amode(base + eval(c + offset))
;;     matches (uextend (iconst c) where c + offset will fit in 32 bits):
;;       amode(base + eval(c + offset))
;;   else:
;;     amode(a + offset + (b << 0))
;; else:
;;   amode(base + offset)
;;
;; The rules for `to_amode` correspond to a subset of the possible addressing
;; modes available by tweaking the SIB byte, the MOD bits, and the size of the
;; displacement (i.e., offset). More information is available in Intel's
;; Software Developer's Manual, volume 2, section 2.1.5, "Addressing-Mode
;; Encoding of ModR/M and SIB Bytes."
(decl to_amode (MemFlags Value Offset32) Amode)
;; ...matches (ishl c ...)
(rule (to_amode flags (iadd (ishl src (const_shift_lt_eq_3 amt)) base) offset)
(amode_imm_reg_reg_shift_flags offset (put_in_gpr base) (put_in_gpr src) amt flags))
(rule (to_amode flags (iadd base (ishl src (const_shift_lt_eq_3 amt))) offset)
(amode_imm_reg_reg_shift_flags offset (put_in_gpr base) (put_in_gpr src) amt flags))
;; ...matches (iconst c ...); note how this matching pattern uses an in-out
;; extractor to check that the offset and the constant value (`c`, the in
;; parameter), when summed, will fit into x64's 32-bit displacement, with the
;; sum returned as `sum` (the out parameter). The syntax for this could be
;; improved (TODO).
(rule (to_amode flags (iadd (iconst c) base) _offset @ (sum_extend_fits_in_32_bits <$I64 <c sum))
(amode_imm_reg_flags sum (put_in_gpr base) flags))
(rule (to_amode flags (iadd base (iconst c)) _offset @ (sum_extend_fits_in_32_bits <$I64 <c sum))
(amode_imm_reg_flags sum (put_in_gpr base) flags))
;; ...matches (uextend(iconst c) ...); see notes above.
(rule (to_amode flags (iadd (has_type ty (uextend (iconst c))) base) _offset @ (sum_extend_fits_in_32_bits <ty <c sum))
(amode_imm_reg_flags sum (put_in_gpr base) flags))
(rule (to_amode flags (iadd base (has_type ty (uextend (iconst c)))) _offset @ (sum_extend_fits_in_32_bits <ty <c sum))
(amode_imm_reg_flags sum (put_in_gpr base) flags))
;; ...else only matches (iadd(a b))
(rule (to_amode flags (iadd base index) offset)
(amode_imm_reg_reg_shift_flags offset (put_in_gpr base) (put_in_gpr index) 0 flags))
;; ...else
(rule (to_amode flags base offset)
(amode_imm_reg_flags offset (put_in_gpr base) flags))
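For readers more at home in Rust than ISLE, the rules above correspond roughly to the sketch below. The `Addr` and `Amode` types and the `put_in_gpr` stub are simplified stand-ins, not the actual Cranelift API, and the constant-folding cases are omitted:

    // Simplified, hypothetical model of the `to_amode` rules above.
    enum Addr {
        Add(Box<Addr>, Box<Addr>),
        ShlConst(Box<Addr>, u8), // a value shifted left by a constant amount
        Other,                   // anything else gets lowered into a register
    }

    enum Amode {
        ImmReg { simm32: u32, base: u32 },
        ImmRegRegShift { simm32: u32, base: u32, index: u32, shift: u8 },
    }

    // Stand-in for `put_in_gpr`: lower a sub-expression into some register number.
    fn put_in_gpr(_a: &Addr) -> u32 {
        0
    }

    fn to_amode(addr: &Addr, offset: u32) -> Amode {
        if let Addr::Add(a, b) = addr {
            // If either operand is a shift by a constant <= 3, use it as the index.
            for (index, base) in [(a, b), (b, a)] {
                if let Addr::ShlConst(src, amt) = &**index {
                    if *amt <= 3 {
                        return Amode::ImmRegRegShift {
                            simm32: offset,
                            base: put_in_gpr(base),
                            index: put_in_gpr(src),
                            shift: *amt,
                        };
                    }
                }
            }
            // Otherwise use both operands, with a shift of zero.
            return Amode::ImmRegRegShift {
                simm32: offset,
                base: put_in_gpr(a),
                index: put_in_gpr(b),
                shift: 0,
            };
        }
        // Fallback: a single base register plus the 32-bit displacement.
        Amode::ImmReg {
            simm32: offset,
            base: put_in_gpr(addr),
        }
    }

The real rules additionally fold `iconst` and `uextend (iconst ...)` operands into the 32-bit displacement, as checked by `sum_extend_fits_in_32_bits`.
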
;; Shift kinds.
(type ShiftKind extern
(enum ShiftLeft
@@ -1152,11 +1230,11 @@
;; Zero extending uses `movzx`.
(rule (extend (ExtendKind.Zero) ty mode src)
(x64_movzx ty mode src))
(x64_movzx mode src))
;; Sign extending uses `movsx`.
(rule (extend (ExtendKind.Sign) ty mode src)
(x64_movsx ty mode src))
(x64_movsx mode src))
;;;; Helpers for Working with SSE Tidbits ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1235,8 +1313,7 @@
(decl x64_load (Type SyntheticAmode ExtKind) Reg)
(rule (x64_load (fits_in_32 ty) addr (ExtKind.SignExtend))
(x64_movsx ty
(ext_mode (ty_bytes ty) 8)
(x64_movsx (ext_mode (ty_bytes ty) 8)
addr))
(rule (x64_load $I64 addr _ext_kind)
@@ -1264,6 +1341,68 @@
(xmm_unary_rm_r (SseOpcode.Movdqu)
addr))
(decl x64_mov (Amode) Reg)
(rule (x64_mov addr)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.Mov64MR addr dst))))
dst))
(decl x64_movzx (ExtMode GprMem) Gpr)
(rule (x64_movzx mode src)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.MovzxRmR mode src dst))))
dst))
(decl x64_movsx (ExtMode GprMem) Gpr)
(rule (x64_movsx mode src)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.MovsxRmR mode src dst))))
dst))
(decl x64_movss_load (XmmMem) Xmm)
(rule (x64_movss_load from)
(xmm_unary_rm_r (SseOpcode.Movss) from))
(decl x64_movsd_load (XmmMem) Xmm)
(rule (x64_movsd_load from)
(xmm_unary_rm_r (SseOpcode.Movsd) from))
(decl x64_movups (XmmMem) Xmm)
(rule (x64_movups from)
(xmm_unary_rm_r (SseOpcode.Movups) from))
(decl x64_movupd (XmmMem) Xmm)
(rule (x64_movupd from)
(xmm_unary_rm_r (SseOpcode.Movupd) from))
(decl x64_movdqu (XmmMem) Xmm)
(rule (x64_movdqu from)
(xmm_unary_rm_r (SseOpcode.Movdqu) from))
(decl x64_pmovsxbw (XmmMem) Xmm)
(rule (x64_pmovsxbw from)
(xmm_unary_rm_r (SseOpcode.Pmovsxbw) from))
(decl x64_pmovzxbw (XmmMem) Xmm)
(rule (x64_pmovzxbw from)
(xmm_unary_rm_r (SseOpcode.Pmovzxbw) from))
(decl x64_pmovsxwd (XmmMem) Xmm)
(rule (x64_pmovsxwd from)
(xmm_unary_rm_r (SseOpcode.Pmovsxwd) from))
(decl x64_pmovzxwd (XmmMem) Xmm)
(rule (x64_pmovzxwd from)
(xmm_unary_rm_r (SseOpcode.Pmovzxwd) from))
(decl x64_pmovsxdq (XmmMem) Xmm)
(rule (x64_pmovsxdq from)
(xmm_unary_rm_r (SseOpcode.Pmovsxdq) from))
(decl x64_pmovzxdq (XmmMem) Xmm)
(rule (x64_pmovzxdq from)
(xmm_unary_rm_r (SseOpcode.Pmovzxdq) from))
;; Load a constant into an XMM register.
(decl x64_xmm_load_const (Type VCodeConstant) Xmm)
(rule (x64_xmm_load_const ty const)
@@ -1665,20 +1804,6 @@
(MInst.Setcc cc dst)
dst)))
;; Helper for creating `MInst.MovzxRmR` instructions.
(decl x64_movzx (Type ExtMode GprMem) Gpr)
(rule (x64_movzx ty mode src)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.MovzxRmR mode src dst))))
dst))
;; Helper for creating `MInst.MovsxRmR` instructions.
(decl x64_movsx (Type ExtMode GprMem) Gpr)
(rule (x64_movsx ty mode src)
(let ((dst WritableGpr (temp_writable_gpr))
(_ Unit (emit (MInst.MovsxRmR mode src dst))))
dst))
;; Helper for creating `MInst.XmmRmR` instructions.
(decl xmm_rm_r (Type SseOpcode Xmm XmmMem) Xmm)
(rule (xmm_rm_r ty op src1 src2)
@@ -1995,8 +2120,8 @@
(xmm_rm_r $F64X2 (SseOpcode.Blendvpd) src1 src2)))
;; Helper for creating `movsd` instructions.
(decl x64_movsd (Xmm XmmMem) Xmm)
(rule (x64_movsd src1 src2)
(decl x64_movsd_regmove (Xmm XmmMem) Xmm)
(rule (x64_movsd_regmove src1 src2)
(xmm_rm_r $I8X16 (SseOpcode.Movsd) src1 src2))
;; Helper for creating `movlhps` instructions.
@@ -2191,16 +2316,6 @@
(_ Unit (emit (MInst.XmmUnaryRmR op src dst))))
dst))
;; Helper for creating `pmovsxbw` instructions.
(decl x64_pmovsxbw (XmmMem) Xmm)
(rule (x64_pmovsxbw src)
(xmm_unary_rm_r (SseOpcode.Pmovsxbw) src))
;; Helper for creating `pmovzxbw` instructions.
(decl x64_pmovzxbw (XmmMem) Xmm)
(rule (x64_pmovzxbw src)
(xmm_unary_rm_r (SseOpcode.Pmovzxbw) src))
;; Helper for creating `pabsb` instructions.
(decl x64_pabsb (XmmMem) Xmm)
(rule (x64_pabsb src)
@@ -2582,7 +2697,9 @@
(convert Imm8Reg Imm8Gpr imm8_reg_to_imm8_gpr)
(convert Amode SyntheticAmode amode_to_synthetic_amode)
(convert Amode GprMem amode_to_gpr_mem)
(convert SyntheticAmode GprMem synthetic_amode_to_gpr_mem)
(convert Amode XmmMem amode_to_xmm_mem)
(convert SyntheticAmode XmmMem synthetic_amode_to_xmm_mem)
(convert IntCC CC intcc_to_cc)
@@ -2614,8 +2731,14 @@
(value_reg w_xmm))
(decl synthetic_amode_to_gpr_mem (SyntheticAmode) GprMem)
(decl amode_to_gpr_mem (Amode) GprMem)
(rule (amode_to_gpr_mem amode)
(amode_to_synthetic_amode amode))
(rule (synthetic_amode_to_gpr_mem amode)
(synthetic_amode_to_reg_mem amode))
(decl amode_to_xmm_mem (Amode) XmmMem)
(rule (amode_to_xmm_mem amode)
(amode_to_synthetic_amode amode))
(decl synthetic_amode_to_xmm_mem (SyntheticAmode) XmmMem)
(rule (synthetic_amode_to_xmm_mem amode)
(synthetic_amode_to_reg_mem amode))


@@ -1349,10 +1349,9 @@
;; internally as `xmm_rm_r` will merge the temp register into our `vec`
;; register.
(rule (vec_insert_lane $F64X2 vec (RegMem.Reg val) 0)
(x64_movsd vec val))
(x64_movsd_regmove vec val))
(rule (vec_insert_lane $F64X2 vec mem 0)
(x64_movsd vec (xmm_unary_rm_r (SseOpcode.Movsd)
mem)))
(x64_movsd_regmove vec (x64_movsd_load mem)))
;; f64x2.replace_lane 1
;;
@@ -2506,3 +2505,65 @@
(x64_maxps y x))
(rule (lower (has_type $F64X2 (fmax_pseudo x y)))
(x64_maxpd y x))
;; Rules for `load*` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; In order to load a value from memory into a GPR, we may need to extend the
;; loaded value from 8, 16, or 32 bits to this backend's expected GPR width:
;; 64 bits. Note that `ext_mode` treats 1-bit types (booleans) as 8-bit loads.
;;
;; By default, we zero-extend all sub-64-bit loads to a GPR.
(rule (lower (has_type (and (fits_in_32 ty) (is_gpr_type _)) (load flags address offset)))
(x64_movzx (ext_mode (ty_bits_u16 ty) 64) (to_amode flags address offset)))
;; But if we know that both the `from` and `to` are 64 bits, we simply load with
;; no extension.
(rule (lower (has_type (ty_int_bool_ref_64 ty) (load flags address offset)))
(x64_mov (to_amode flags address offset)))
;; Also, certain scalar loads have a specific `from` width and extension kind
;; (signed -> `sx`, zeroed -> `zx`). We overwrite the high bits of the 64-bit
;; GPR even if the `to` type is smaller (e.g., 16 bits).
(rule (lower (has_type (is_gpr_type ty) (uload8 flags address offset)))
(x64_movzx (ExtMode.BQ) (to_amode flags address offset)))
(rule (lower (has_type (is_gpr_type ty) (sload8 flags address offset)))
(x64_movsx (ExtMode.BQ) (to_amode flags address offset)))
(rule (lower (has_type (is_gpr_type ty) (uload16 flags address offset)))
(x64_movzx (ExtMode.WQ) (to_amode flags address offset)))
(rule (lower (has_type (is_gpr_type ty) (sload16 flags address offset)))
(x64_movsx (ExtMode.WQ) (to_amode flags address offset)))
(rule (lower (has_type (is_gpr_type ty) (uload32 flags address offset)))
(x64_movzx (ExtMode.LQ) (to_amode flags address offset)))
(rule (lower (has_type (is_gpr_type ty) (sload32 flags address offset)))
(x64_movsx (ExtMode.LQ) (to_amode flags address offset)))
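The extend modes above name the source and destination widths (B/W/L to Q, i.e. byte/word/longword to quadword). A minimal sketch of that mapping, using a hypothetical helper that returns the mode's name rather than the backend's `ExtMode` enum:

    // Hypothetical helper: pick the extend mode for a scalar load widened to 64 bits.
    fn ext_mode_to_64(from_bits: u32) -> &'static str {
        match from_bits {
            1 | 8 => "BQ", // byte -> quadword (1-bit booleans are loaded as bytes)
            16 => "WQ",    // word -> quadword
            32 => "LQ",    // longword -> quadword
            _ => panic!("64-bit loads need no extension"),
        }
    }
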
;; To load to XMM registers, we use the x64-specific instructions for each type.
;; For `$F32` and `$F64` this is important: we only want to load 32 or 64 bits.
;; For the 128-bit types, the specific choice is not necessary for performance,
;; but it may help with clarity when reading disassembly.
(rule (lower (has_type $F32 (load flags address offset)))
(x64_movss_load (to_amode flags address offset)))
(rule (lower (has_type $F64 (load flags address offset)))
(x64_movsd_load (to_amode flags address offset)))
(rule (lower (has_type $F32X4 (load flags address offset)))
(x64_movups (to_amode flags address offset)))
(rule (lower (has_type $F64X2 (load flags address offset)))
(x64_movupd (to_amode flags address offset)))
(rule (lower (has_type (ty_vec128 ty) (load flags address offset)))
(x64_movdqu (to_amode flags address offset)))
;; We also include widening vector loads; these sign- or zero-extend each lane
;; to the next wider width (e.g., 16x4 -> 32x4).
(rule (lower (has_type $I16X8 (sload8x8 flags address offset)))
(x64_pmovsxbw (to_amode flags address offset)))
(rule (lower (has_type $I16X8 (uload8x8 flags address offset)))
(x64_pmovzxbw (to_amode flags address offset)))
(rule (lower (has_type $I32X4 (sload16x4 flags address offset)))
(x64_pmovsxwd (to_amode flags address offset)))
(rule (lower (has_type $I32X4 (uload16x4 flags address offset)))
(x64_pmovzxwd (to_amode flags address offset)))
(rule (lower (has_type $I64X2 (sload32x2 flags address offset)))
(x64_pmovsxdq (to_amode flags address offset)))
(rule (lower (has_type $I64X2 (uload32x2 flags address offset)))
(x64_pmovzxdq (to_amode flags address offset)))
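As a scalar model of what one of these widening loads computes (here `sload8x8`, which lowers to `pmovsxbw` above), a small illustrative sketch rather than backend code:

    // Illustrative model of sload8x8 -> pmovsxbw: read 8 bytes and sign-extend
    // each one into a 16-bit lane; uload8x8 -> pmovzxbw would take u8 instead.
    fn sload8x8(lanes: [i8; 8]) -> [i16; 8] {
        lanes.map(|b| b as i16)
    }
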
;; TODO: Multi-register loads (I128)


@@ -2192,18 +2192,6 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
_ => unimplemented!(),
};
let ext_mode = ExtMode::new(elem_ty.bits(), 64);
let sign_extend = match op {
Opcode::Sload8
| Opcode::Sload16
| Opcode::Sload32
| Opcode::Sload8x8
| Opcode::Sload16x4
| Opcode::Sload32x2 => true,
_ => false,
};
let amode = match op {
Opcode::Load
| Opcode::Uload8
@@ -2229,60 +2217,7 @@ fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::mov64_m_r(amode.clone(), dsts.regs()[0]));
ctx.emit(Inst::mov64_m_r(amode.offset(8), dsts.regs()[1]));
} else {
let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
let is_xmm = elem_ty.is_float() || elem_ty.is_vector();
match (sign_extend, is_xmm) {
(true, false) => {
// The load is sign-extended only when the output size is lower than 64 bits,
// so ext-mode is defined in this case.
ctx.emit(Inst::movsx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst));
}
(false, false) => {
if elem_ty.bytes() == 8 {
// Use a plain load.
ctx.emit(Inst::mov64_m_r(amode, dst))
} else {
// Use a zero-extended load.
ctx.emit(Inst::movzx_rm_r(ext_mode.unwrap(), RegMem::mem(amode), dst))
}
}
(_, true) => {
ctx.emit(match elem_ty {
types::F32 => Inst::xmm_mov(SseOpcode::Movss, RegMem::mem(amode), dst),
types::F64 => Inst::xmm_mov(SseOpcode::Movsd, RegMem::mem(amode), dst),
types::I8X8 => {
if sign_extend == true {
Inst::xmm_mov(SseOpcode::Pmovsxbw, RegMem::mem(amode), dst)
} else {
Inst::xmm_mov(SseOpcode::Pmovzxbw, RegMem::mem(amode), dst)
}
}
types::I16X4 => {
if sign_extend == true {
Inst::xmm_mov(SseOpcode::Pmovsxwd, RegMem::mem(amode), dst)
} else {
Inst::xmm_mov(SseOpcode::Pmovzxwd, RegMem::mem(amode), dst)
}
}
types::I32X2 => {
if sign_extend == true {
Inst::xmm_mov(SseOpcode::Pmovsxdq, RegMem::mem(amode), dst)
} else {
Inst::xmm_mov(SseOpcode::Pmovzxdq, RegMem::mem(amode), dst)
}
}
_ if elem_ty.is_vector() && elem_ty.bits() == 128 => {
Inst::xmm_mov(SseOpcode::Movups, RegMem::mem(amode), dst)
}
// TODO Specialize for different types: MOVUPD, MOVDQU
_ => unreachable!(
"unexpected type for load: {:?} - {:?}",
elem_ty,
elem_ty.bits()
),
});
}
}
implemented_in_isle(ctx);
}
}


@@ -12,7 +12,7 @@ use crate::{
condcodes::{FloatCC, IntCC},
immediates::*,
types::*,
Inst, InstructionData, Opcode, TrapCode, Value, ValueLabel, ValueList,
Inst, InstructionData, MemFlags, Opcode, TrapCode, Value, ValueLabel, ValueList,
},
isa::{
settings::Flags,
@@ -313,11 +313,30 @@ where
Amode::imm_reg_reg_shift(simm32, base, index, shift)
}
#[inline]
fn amode_imm_reg(&mut self, simm32: u32, base: Gpr) -> Amode {
Amode::imm_reg(simm32, base.to_reg())
}
#[inline]
fn amode_with_flags(&mut self, amode: &Amode, flags: MemFlags) -> Amode {
amode.with_flags(flags)
}
#[inline]
fn amode_to_synthetic_amode(&mut self, amode: &Amode) -> SyntheticAmode {
amode.clone().into()
}
#[inline]
fn const_shift_lt_eq_3(&mut self, shift_amount: Value) -> Option<u8> {
let input = self.lower_ctx.get_value_as_source_or_const(shift_amount);
match input.constant {
Some(shift_amount) if shift_amount <= 3 => Some(shift_amount as u8),
_ => None,
}
}
#[inline]
fn writable_gpr_to_reg(&mut self, r: WritableGpr) -> WritableReg {
r.to_writable_reg()
@@ -519,6 +538,28 @@ where
fn intcc_to_cc(&mut self, intcc: &IntCC) -> CC {
CC::from_intcc(*intcc)
}
#[inline]
fn sum_extend_fits_in_32_bits(
&mut self,
offset: Offset32,
extend_from_ty: Type,
constant_value: Imm64,
) -> Option<u32> {
let offset: i64 = offset.into();
let constant_value: u64 = constant_value.bits() as u64;
// If necessary, zero extend `constant_value` up to 64 bits.
let shift = 64 - extend_from_ty.bits();
let zero_extended_constant_value = (constant_value << shift) >> shift;
// Sum up the two operands.
let sum = offset.wrapping_add(zero_extended_constant_value as i64);
// Check that the sum will fit in 32-bits.
if sum == ((sum << 32) >> 32) {
Some(sum as u32)
} else {
None
}
}
}
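To make the arithmetic in `sum_extend_fits_in_32_bits` concrete, here is a standalone restatement of the same check with a couple of example inputs, using plain integers instead of `Offset32`/`Imm64` (illustrative only):

    // Zero-extend `constant` from `from_bits` bits, add the offset, and require
    // the result to be a valid sign-extended 32-bit displacement.
    fn sum_fits_in_32_bits(offset: i64, constant: u64, from_bits: u32) -> Option<u32> {
        let shift = 64 - from_bits;
        let zext = (constant << shift) >> shift;
        let sum = offset.wrapping_add(zext as i64);
        if sum == (sum << 32) >> 32 {
            Some(sum as u32)
        } else {
            None
        }
    }

    fn main() {
        // A 16-bit constant 0xFFFF (zero-extends to 0xFFFF) plus offset 16 fits.
        assert_eq!(sum_fits_in_32_bits(16, 0xFFFF, 16), Some(0x1000F));
        // A constant near i32::MAX plus a positive offset no longer fits.
        assert_eq!(sum_fits_in_32_bits(64, i32::MAX as u64, 64), None);
    }
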
// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we


@@ -1,4 +1,4 @@
src/clif.isle 443b34b797fc8ace
src/prelude.isle 74d9514ac948e163
src/isa/x64/inst.isle a002d62dcfce285
src/isa/x64/lower.isle 8f3e1ed2929fd07e
src/prelude.isle c0751050a11e2686
src/isa/x64/inst.isle c4729db7808ba0b5
src/isa/x64/lower.isle 7e839e6b667bfe77

File diff suppressed because it is too large


@@ -245,6 +245,14 @@ macro_rules! isle_prelude_methods {
}
}
#[inline]
fn ty_int_bool_ref_64(&mut self, ty: Type) -> Option<Type> {
match ty {
I64 | B64 | R64 => Some(ty),
_ => None,
}
}
#[inline]
fn ty_int_bool_128(&mut self, ty: Type) -> Option<Type> {
match ty {
@@ -441,6 +449,12 @@ macro_rules! isle_prelude_methods {
fn lane_type(&mut self, ty: Type) -> Type {
ty.lane_type()
}
#[inline]
fn offset32_to_u32(&mut self, offset: Offset32) -> u32 {
let offset: i32 = offset.into();
offset as u32
}
};
}


@@ -265,6 +265,10 @@
(decl ty_int_bool_64 (Type) Type)
(extern extractor ty_int_bool_64 ty_int_bool_64)
;; An extractor that matches I64 or B64 or R64.
(decl ty_int_bool_ref_64 (Type) Type)
(extern extractor ty_int_bool_ref_64 ty_int_bool_ref_64)
;; An extractor that matches I128 or B128.
(decl ty_int_bool_128 (Type) Type)
(extern extractor ty_int_bool_128 ty_int_bool_128)
@@ -379,6 +383,10 @@
(extractor (u64_from_iconst x)
(def_inst (iconst (u64_from_imm64 x))))
;; Convert an `Offset32` to a primitive number.
(decl offset32_to_u32 (Offset32) u32)
(extern constructor offset32_to_u32 offset32_to_u32)
;; Instruction creation helpers ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Emit an instruction.
@@ -564,3 +572,4 @@
(convert ValueRegs InstOutput output)
(convert Reg InstOutput output_reg)
(convert Value InstOutput output_value)
(convert Offset32 u32 offset32_to_u32)