aarch64: Use VCodeConstant for f64/v128 constants (#5997)

* aarch64: Translate float and splat lowering to ISLE

I was looking into `constant_f128` and its fallback lowering into memory,
and to get familiar with the code I figured it'd be good to port some
Rust logic to ISLE. This commit ports the `constant_{f128,f64,f32}`
helpers from Rust to ISLE, as well as the `splat_const` helper, which
ended up being closely related.

The test changes mostly reflect regalloc churn, but one notable
difference is that lowering an `f32` constant now creates a 32-bit
immediate rather than a 64-bit immediate (in a GP register, before it
is moved into an FP register). This is semantically identical, but the
generated code differs slightly in a few minor cases.
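
As a standalone illustration of why the narrower immediate is
semantics-preserving (plain Rust, not backend code; 1.5 is just an
arbitrary example value): only the low 32 bits of the materialized
value ever reach the FP register, so a u32 immediate and its
zero-extension to u64 produce the same f32.

fn main() {
    let bits32 = 1.5f32.to_bits(); // 0x3fc00000
    let bits64 = u64::from(bits32); // what the old lowering put in a 64-bit GP register
    // Either way only the low 32 bits end up in the FP register, so the
    // resulting f32 is the same.
    assert_eq!(f32::from_bits(bits64 as u32), 1.5f32);
    assert_eq!(f32::from_bits(bits32), 1.5f32);
}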

* aarch64: Load f64/v128 constants from a pool

This commit removes the `LoadFpuConst64` and `LoadFpuConst128`
pseudo-instructions from the AArch64 backend, which loaded a nearby
inline constant and then jumped over it. Constants now go through the
`VCodeConstant` infrastructure and are placed in a pool at the end of
the function, similar to how the x64 backend works. Minor support was
also added for a new addressing mode that performs a
`MachLabel`-relative load.
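
For reference, a minimal standalone sketch of the reachability
constraint behind that label-relative load (plain Rust;
`ldr_literal_in_range` is just an illustrative name, and the roughly
+/-1 MiB figure is the architectural limit of AArch64's LDR (literal)
form rather than something introduced by this commit). The `MachBuffer`
label fixup fills in the actual offset once the pool's final position
is known.

fn ldr_literal_in_range(byte_offset: i64) -> bool {
    // LDR (literal) encodes a signed 19-bit word offset, i.e. a multiple
    // of 4 bytes within roughly +/-1 MiB of the load instruction.
    let words = byte_offset / 4;
    byte_offset % 4 == 0 && words >= -(1 << 18) && words < (1 << 18)
}

fn main() {
    assert!(ldr_literal_in_range(8)); // the old inline `pc+8` form
    assert!(ldr_literal_in_range((1 << 20) - 4)); // farthest reachable pool entry
    assert!(!ldr_literal_in_range(1 << 20)); // one word too far
}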

Author: Alex Crichton
Date: 2023-03-13 14:33:52 -05:00
Committed by: GitHub
Parent: 6ecdc2482e
Commit: 03b5dbb3e0
25 changed files with 622 additions and 744 deletions

View File

@@ -466,14 +466,6 @@
(mem PairAMode)
(flags MemFlags))
(LoadFpuConst64
(rd WritableReg)
(const_data u64))
(LoadFpuConst128
(rd WritableReg)
(const_data u128))
;; Conversion: FP -> integer.
(FpuToInt
(op FpuToIntOp)
@@ -1135,6 +1127,11 @@
(off i64)
(ty Type))
;; A reference to a constant which is placed outside of the function's
;; body, typically at the end.
(Const
(addr VCodeConstant))
;; Offset from the "nominal stack pointer", which is where the real SP is
;; just after stack and spill slots are allocated in the function prologue.
;; At emission time, this is converted to `SPOffset` with a fixup added to
@@ -1194,6 +1191,16 @@
(rule (lane_size (dynamic_lane 32 _)) (ScalarSize.Size32))
(rule (lane_size (dynamic_lane 64 _)) (ScalarSize.Size64))
;; Helper for extracting the size of a lane from the input `VectorSize`
(decl pure vector_lane_size (VectorSize) ScalarSize)
(rule (vector_lane_size (VectorSize.Size8x16)) (ScalarSize.Size8))
(rule (vector_lane_size (VectorSize.Size8x8)) (ScalarSize.Size8))
(rule (vector_lane_size (VectorSize.Size16x8)) (ScalarSize.Size16))
(rule (vector_lane_size (VectorSize.Size16x4)) (ScalarSize.Size16))
(rule (vector_lane_size (VectorSize.Size32x4)) (ScalarSize.Size32))
(rule (vector_lane_size (VectorSize.Size32x2)) (ScalarSize.Size32))
(rule (vector_lane_size (VectorSize.Size64x2)) (ScalarSize.Size64))
(type Cond extern
(enum
(Eq)
@@ -1908,6 +1915,13 @@
(_ Unit (emit (MInst.VecDupFromFpu dst src size lane))))
dst))
;; Helper for emitting `MInst.VecDupImm` instructions.
(decl vec_dup_imm (ASIMDMovModImm bool VectorSize) Reg)
(rule (vec_dup_imm imm invert size)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecDupImm dst imm invert size))))
dst))
;; Helper for emitting `MInst.AluRRImm12` instructions.
(decl alu_rr_imm12 (ALUOp Type Reg Imm12) Reg)
(rule (alu_rr_imm12 op ty src imm)
@@ -2158,6 +2172,13 @@
(_ Unit (emit (MInst.MovToFpu dst x size))))
dst))
;; Helper for emitting `MInst.FpuMoveFPImm` instructions.
(decl fpu_move_fp_imm (ASIMDFPModImm ScalarSize) Reg)
(rule (fpu_move_fp_imm imm size)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.FpuMoveFPImm dst imm size))))
dst))
;; Helper for emitting `MInst.MovToVec` instructions.
(decl mov_to_vec (Reg Reg u8 VectorSize) Reg)
(rule (mov_to_vec src1 src2 lane size)
@@ -2986,24 +3007,122 @@
(amode ty addr offset)))
;; Lower a constant f32.
(decl constant_f32 (u64) Reg)
;; TODO: Port lower_constant_f32() to ISLE.
(extern constructor constant_f32 constant_f32)
;;
;; Note that we must make sure that all bits outside the lowest 32 are set to 0
;; because this function is also used to load wider constants (that have zeros
;; in their most significant bits).
(decl constant_f32 (u32) Reg)
(rule 2 (constant_f32 0)
(vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size32))
$false
(VectorSize.Size32x2)))
(rule 1 (constant_f32 n)
(if-let imm (asimd_fp_mod_imm_from_u64 n (ScalarSize.Size32)))
(fpu_move_fp_imm imm (ScalarSize.Size32)))
(rule (constant_f32 n)
(mov_to_fpu (imm $I32 (ImmExtend.Zero) n) (ScalarSize.Size32)))
;; Lower a constant f64.
;;
;; Note that we must make sure that all bits outside the lowest 64 are set to 0
;; because this function is also used to load wider constants (that have zeros
;; in their most significant bits).
;; TODO: Treat as half of a 128 bit vector and consider replicated patterns.
;; Scalar MOVI might also be an option.
(decl constant_f64 (u64) Reg)
;; TODO: Port lower_constant_f64() to ISLE.
(extern constructor constant_f64 constant_f64)
(rule 4 (constant_f64 0)
(vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size32))
$false
(VectorSize.Size32x2)))
(rule 3 (constant_f64 n)
(if-let imm (asimd_fp_mod_imm_from_u64 n (ScalarSize.Size64)))
(fpu_move_fp_imm imm (ScalarSize.Size64)))
(rule 2 (constant_f64 (u64_as_u32 n))
(constant_f32 n))
(rule 1 (constant_f64 (u64_low32_bits_unset n))
(mov_to_fpu (imm $I64 (ImmExtend.Zero) n) (ScalarSize.Size64)))
(rule (constant_f64 n)
(fpu_load64 (AMode.Const (emit_u64_le_const n)) (mem_flags_trusted)))
;; Tests whether the low 32 bits in the input are all zero.
(decl u64_low32_bits_unset (u64) u64)
(extern extractor u64_low32_bits_unset u64_low32_bits_unset)
;; Lower a constant f128.
(decl constant_f128 (u128) Reg)
;; TODO: Port lower_constant_f128() to ISLE.
(extern constructor constant_f128 constant_f128)
(rule 3 (constant_f128 0)
(vec_dup_imm (asimd_mov_mod_imm_zero (ScalarSize.Size8))
$false
(VectorSize.Size8x16)))
;; If the upper 64-bits are all zero then defer to `constant_f64`.
(rule 2 (constant_f128 (u128_as_u64 n)) (constant_f64 n))
;; If the low half of the u128 equals the high half then delegate to the splat
;; logic as a splat of a 64-bit value.
(rule 1 (constant_f128 (u128_replicated_u64 n))
(splat_const n (VectorSize.Size64x2)))
;; Base case is to load the constant from memory.
(rule (constant_f128 n)
(fpu_load128 (AMode.Const (emit_u128_le_const n)) (mem_flags_trusted)))
;; Lower a vector splat with a constant parameter.
;;
;; The 64-bit input here only uses the low bits for the lane size in
;; `VectorSize` and all other bits are ignored.
(decl splat_const (u64 VectorSize) Reg)
;; TODO: Port lower_splat_const() to ISLE.
(extern constructor splat_const splat_const)
;; If the splat'd constant can itself be reduced in size then attempt to do so
;; as it will make it easier to create the immediates in the instructions below.
(rule 5 (splat_const (u64_replicated_u32 n) (VectorSize.Size64x2))
(splat_const n (VectorSize.Size32x4)))
(rule 5 (splat_const (u32_replicated_u16 n) (VectorSize.Size32x4))
(splat_const n (VectorSize.Size16x8)))
(rule 5 (splat_const (u32_replicated_u16 n) (VectorSize.Size32x2))
(splat_const n (VectorSize.Size16x4)))
(rule 5 (splat_const (u16_replicated_u8 n) (VectorSize.Size16x8))
(splat_const n (VectorSize.Size8x16)))
(rule 5 (splat_const (u16_replicated_u8 n) (VectorSize.Size16x4))
(splat_const n (VectorSize.Size8x8)))
;; Special cases for `vec_dup_imm` instructions where the input is either
;; negated or not.
(rule 4 (splat_const n size)
(if-let imm (asimd_mov_mod_imm_from_u64 n (vector_lane_size size)))
(vec_dup_imm imm $false size))
(rule 3 (splat_const n size)
(if-let imm (asimd_mov_mod_imm_from_u64 (u64_not n) (vector_lane_size size)))
(vec_dup_imm imm $true size))
;; Special case a 32-bit splat where an immediate can be created by
;; concatenating the 32-bit constant into a 64-bit value
(rule 2 (splat_const n (VectorSize.Size32x4))
(if-let imm (asimd_mov_mod_imm_from_u64 (u64_or n (u64_shl n 32)) (ScalarSize.Size64)))
(vec_dup_imm imm $false (VectorSize.Size64x2)))
(rule 2 (splat_const n (VectorSize.Size32x2))
(if-let imm (asimd_mov_mod_imm_from_u64 (u64_or n (u64_shl n 32)) (ScalarSize.Size64)))
(fpu_extend (vec_dup_imm imm $false (VectorSize.Size64x2)) (ScalarSize.Size64)))
(rule 1 (splat_const n size)
(if-let imm (asimd_fp_mod_imm_from_u64 n (vector_lane_size size)))
(vec_dup_fp_imm imm size))
;; The base case for splat is to use `vec_dup` with the immediate loaded into a
;; register.
(rule (splat_const n size)
(vec_dup (imm $I64 (ImmExtend.Zero) n) size))
;; Each of these extractors tests whether the upper half of the input equals the
;; lower half of the input
(decl u128_replicated_u64 (u64) u128)
(extern extractor u128_replicated_u64 u128_replicated_u64)
(decl u64_replicated_u32 (u64) u64)
(extern extractor u64_replicated_u32 u64_replicated_u32)
(decl u32_replicated_u16 (u64) u64)
(extern extractor u32_replicated_u16 u32_replicated_u16)
(decl u16_replicated_u8 (u64) u64)
(extern extractor u16_replicated_u8 u16_replicated_u8)
;; Lower a FloatCC to a Cond.
(decl fp_cond_code (FloatCC) Cond)
@@ -3814,3 +3933,36 @@
;; Helper for emitting the `trn2` instruction
(decl vec_trn2 (Reg Reg VectorSize) Reg)
(rule (vec_trn2 rn rm size) (vec_rrr (VecALUOp.Trn2) rn rm size))
;; Helper for creating a zero value `ASIMDMovModImm` immediate.
(decl asimd_mov_mod_imm_zero (ScalarSize) ASIMDMovModImm)
(extern constructor asimd_mov_mod_imm_zero asimd_mov_mod_imm_zero)
;; Helper for fallibly creating an `ASIMDMovModImm` immediate from its parts.
(decl pure partial asimd_mov_mod_imm_from_u64 (u64 ScalarSize) ASIMDMovModImm)
(extern constructor asimd_mov_mod_imm_from_u64 asimd_mov_mod_imm_from_u64)
;; Helper for fallibly creating an `ASIMDFPModImm` immediate from its parts.
(decl pure partial asimd_fp_mod_imm_from_u64 (u64 ScalarSize) ASIMDFPModImm)
(extern constructor asimd_fp_mod_imm_from_u64 asimd_fp_mod_imm_from_u64)
;; Helper for creating a `VecDupFPImm` instruction
(decl vec_dup_fp_imm (ASIMDFPModImm VectorSize) Reg)
(rule (vec_dup_fp_imm imm size)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.VecDupFPImm dst imm size))))
dst))
;; Helper for creating a `FpuLoad64` instruction
(decl fpu_load64 (AMode MemFlags) Reg)
(rule (fpu_load64 amode flags)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.FpuLoad64 dst amode flags))))
dst))
;; Helper for creating a `FpuLoad128` instruction
(decl fpu_load128 (AMode MemFlags) Reg)
(rule (fpu_load128 amode flags)
(let ((dst WritableReg (temp_writable_reg $I8X16))
(_ Unit (emit (MInst.FpuLoad128 dst amode flags))))
dst))

View File

@@ -124,6 +124,9 @@ pub enum MemLabel {
/// offset from this instruction. This form must be used at emission time;
/// see `memlabel_finalize()` for how other forms are lowered to this one.
PCRel(i32),
/// An address that refers to a label within a `MachBuffer`, for example a
/// constant that lives in the pool at the end of the function.
Mach(MachLabel),
}
impl AMode {
@@ -194,6 +197,7 @@ impl AMode {
| &AMode::FPOffset { .. }
| &AMode::SPOffset { .. }
| &AMode::NominalSPOffset { .. }
| &AMode::Const { .. }
| AMode::Label { .. } => self.clone(),
}
}
@@ -382,7 +386,8 @@ impl PrettyPrint for ExtendOp {
impl PrettyPrint for MemLabel {
fn pretty_print(&self, _: u8, _: &mut AllocationConsumer<'_>) -> String {
match self {
&MemLabel::PCRel(off) => format!("pc+{}", off),
MemLabel::PCRel(off) => format!("pc+{}", off),
MemLabel::Mach(off) => format!("label({})", off.get()),
}
}
}
@@ -465,6 +470,8 @@ impl PrettyPrint for AMode {
let simm9 = simm9.pretty_print(8, allocs);
format!("[sp], {}", simm9)
}
AMode::Const { addr } => format!("[const({})]", addr.as_u32()),
// Eliminated by `mem_finalize()`.
&AMode::SPOffset { .. }
| &AMode::FPOffset { .. }

View File

@@ -2,7 +2,7 @@
use regalloc2::Allocation;
use crate::binemit::{CodeOffset, Reloc, StackMap};
use crate::binemit::{Reloc, StackMap};
use crate::ir::{types::*, RelSourceLoc};
use crate::ir::{LibCall, MemFlags, TrapCode};
use crate::isa::aarch64::inst::*;
@@ -10,20 +10,12 @@ use crate::machinst::{ty_bits, Reg, RegClass, Writable};
use crate::trace;
use core::convert::TryFrom;
/// Memory label/reference finalization: convert a MemLabel to a PC-relative
/// offset, possibly emitting relocation(s) as necessary.
pub fn memlabel_finalize(_insn_off: CodeOffset, label: &MemLabel) -> i32 {
match label {
&MemLabel::PCRel(rel) => rel,
}
}
/// Memory addressing mode finalization: convert "special" modes (e.g.,
/// generic arbitrary stack offset) into real addressing modes, possibly by
/// emitting some helper instructions that come immediately before the use
/// of this amode.
pub fn mem_finalize(
insn_off: CodeOffset,
sink: Option<&mut MachBuffer<Inst>>,
mem: &AMode,
state: &EmitState,
) -> (SmallVec<[Inst; 4]>, AMode) {
@@ -74,14 +66,14 @@ pub fn mem_finalize(
}
}
&AMode::Label { ref label } => {
let off = memlabel_finalize(insn_off, label);
(
smallvec![],
AMode::Label {
label: MemLabel::PCRel(off),
},
)
AMode::Const { addr } => {
let sink = match sink {
Some(sink) => sink,
None => return (smallvec![], mem.clone()),
};
let label = sink.get_label_for_constant(*addr);
let label = MemLabel::Mach(label);
(smallvec![], AMode::Label { label })
}
_ => (smallvec![], mem.clone()),
@@ -959,7 +951,7 @@ impl MachInstEmit for Inst {
| &Inst::FpuLoad128 { rd, ref mem, flags } => {
let rd = allocs.next_writable(rd);
let mem = mem.with_allocs(&mut allocs);
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), &mem, state);
let (mem_insts, mem) = mem_finalize(Some(sink), &mem, state);
for inst in mem_insts.into_iter() {
inst.emit(&[], sink, emit_info, state);
@@ -1039,7 +1031,19 @@ impl MachInstEmit for Inst {
&AMode::Label { ref label } => {
let offset = match label {
// cast i32 to u32 (two's-complement)
&MemLabel::PCRel(off) => off as u32,
MemLabel::PCRel(off) => *off as u32,
// Emit a relocation into the `MachBuffer`
// for the label that's being loaded from and
// encode an address of 0 in its place which will
// get filled in by relocation resolution later on.
MemLabel::Mach(label) => {
sink.use_label_at_offset(
sink.cur_offset(),
*label,
LabelUse::Ldr19,
);
0
}
} / 4;
assert!(offset < (1 << 19));
match self {
@@ -1076,6 +1080,7 @@ impl MachInstEmit for Inst {
&AMode::SPOffset { .. }
| &AMode::FPOffset { .. }
| &AMode::NominalSPOffset { .. }
| &AMode::Const { .. }
| &AMode::RegOffset { .. } => {
panic!("Should not see {:?} here!", mem)
}
@@ -1091,7 +1096,7 @@ impl MachInstEmit for Inst {
| &Inst::FpuStore128 { rd, ref mem, flags } => {
let rd = allocs.next(rd);
let mem = mem.with_allocs(&mut allocs);
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), &mem, state);
let (mem_insts, mem) = mem_finalize(Some(sink), &mem, state);
for inst in mem_insts.into_iter() {
inst.emit(&[], sink, emit_info, state);
@@ -1172,6 +1177,7 @@ impl MachInstEmit for Inst {
&AMode::SPOffset { .. }
| &AMode::FPOffset { .. }
| &AMode::NominalSPOffset { .. }
| &AMode::Const { .. }
| &AMode::RegOffset { .. } => {
panic!("Should not see {:?} here!", mem)
}
@@ -2319,41 +2325,6 @@ impl MachInstEmit for Inst {
};
sink.put4(enc_inttofpu(top16, rd, rn));
}
&Inst::LoadFpuConst64 { rd, const_data } => {
let rd = allocs.next_writable(rd);
let inst = Inst::FpuLoad64 {
rd,
mem: AMode::Label {
label: MemLabel::PCRel(8),
},
flags: MemFlags::trusted(),
};
inst.emit(&[], sink, emit_info, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(12),
};
inst.emit(&[], sink, emit_info, state);
sink.put8(const_data);
}
&Inst::LoadFpuConst128 { rd, const_data } => {
let rd = allocs.next_writable(rd);
let inst = Inst::FpuLoad128 {
rd,
mem: AMode::Label {
label: MemLabel::PCRel(8),
},
flags: MemFlags::trusted(),
};
inst.emit(&[], sink, emit_info, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(20),
};
inst.emit(&[], sink, emit_info, state);
for i in const_data.to_le_bytes().iter() {
sink.put1(*i);
}
}
&Inst::FpuCSel32 { rd, rn, rm, cond } => {
let rd = allocs.next_writable(rd);
let rn = allocs.next(rn);
@@ -3350,7 +3321,7 @@ impl MachInstEmit for Inst {
&Inst::LoadAddr { rd, ref mem } => {
let rd = allocs.next_writable(rd);
let mem = mem.with_allocs(&mut allocs);
let (mem_insts, mem) = mem_finalize(sink.cur_offset(), &mem, state);
let (mem_insts, mem) = mem_finalize(Some(sink), &mem, state);
for inst in mem_insts.into_iter() {
inst.emit(&[], sink, emit_info, state);
}

View File

@@ -6891,24 +6891,6 @@ fn test_aarch64_binemit() {
"stp q18, q22, [sp], #304",
));
insns.push((
Inst::LoadFpuConst64 {
rd: writable_vreg(16),
const_data: 1.0_f64.to_bits(),
},
"5000005C03000014000000000000F03F",
"ldr d16, pc+8 ; b 12 ; data.f64 1",
));
insns.push((
Inst::LoadFpuConst128 {
rd: writable_vreg(5),
const_data: 0x0f0e0d0c0b0a09080706050403020100,
},
"4500009C05000014000102030405060708090A0B0C0D0E0F",
"ldr q5, pc+8 ; b 20 ; data.f128 0x0f0e0d0c0b0a09080706050403020100",
));
insns.push((
Inst::FpuCSel32 {
rd: writable_vreg(1),

View File

@@ -10,7 +10,6 @@ use crate::{settings, CodegenError, CodegenResult};
use crate::machinst::{PrettyPrint, Reg, RegClass, Writable};
use alloc::vec::Vec;
use core::convert::TryFrom;
use regalloc2::{PRegSet, VReg};
use smallvec::{smallvec, SmallVec};
use std::string::{String, ToString};
@@ -250,215 +249,6 @@ impl Inst {
}
}
/// Create instructions that load a 32-bit floating-point constant.
pub fn load_fp_constant32<F: FnMut(Type) -> Writable<Reg>>(
rd: Writable<Reg>,
const_data: u32,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 4]> {
// Note that we must make sure that all bits outside the lowest 32 are set to 0
// because this function is also used to load wider constants (that have zeros
// in their most significant bits).
if const_data == 0 {
smallvec![Inst::VecDupImm {
rd,
imm: ASIMDMovModImm::zero(ScalarSize::Size32),
invert: false,
size: VectorSize::Size32x2,
}]
} else if let Some(imm) =
ASIMDFPModImm::maybe_from_u64(const_data.into(), ScalarSize::Size32)
{
smallvec![Inst::FpuMoveFPImm {
rd,
imm,
size: ScalarSize::Size32,
}]
} else {
let tmp = alloc_tmp(I32);
let mut insts = Inst::load_constant(tmp, const_data as u64, &mut alloc_tmp);
insts.push(Inst::MovToFpu {
rd,
rn: tmp.to_reg(),
size: ScalarSize::Size32,
});
insts
}
}
/// Create instructions that load a 64-bit floating-point constant.
pub fn load_fp_constant64<F: FnMut(Type) -> Writable<Reg>>(
rd: Writable<Reg>,
const_data: u64,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 4]> {
// Note that we must make sure that all bits outside the lowest 64 are set to 0
// because this function is also used to load wider constants (that have zeros
// in their most significant bits).
// TODO: Treat as half of a 128 bit vector and consider replicated patterns.
// Scalar MOVI might also be an option.
if const_data == 0 {
smallvec![Inst::VecDupImm {
rd,
imm: ASIMDMovModImm::zero(ScalarSize::Size32),
invert: false,
size: VectorSize::Size32x2,
}]
} else if let Some(imm) = ASIMDFPModImm::maybe_from_u64(const_data, ScalarSize::Size64) {
smallvec![Inst::FpuMoveFPImm {
rd,
imm,
size: ScalarSize::Size64,
}]
} else if let Ok(const_data) = u32::try_from(const_data) {
Inst::load_fp_constant32(rd, const_data, alloc_tmp)
} else if const_data & (u32::MAX as u64) == 0 {
let tmp = alloc_tmp(I64);
let mut insts = Inst::load_constant(tmp, const_data, &mut alloc_tmp);
insts.push(Inst::MovToFpu {
rd,
rn: tmp.to_reg(),
size: ScalarSize::Size64,
});
insts
} else {
smallvec![Inst::LoadFpuConst64 { rd, const_data }]
}
}
/// Create instructions that load a 128-bit vector constant.
pub fn load_fp_constant128<F: FnMut(Type) -> Writable<Reg>>(
rd: Writable<Reg>,
const_data: u128,
alloc_tmp: F,
) -> SmallVec<[Inst; 5]> {
if let Ok(const_data) = u64::try_from(const_data) {
SmallVec::from(&Inst::load_fp_constant64(rd, const_data, alloc_tmp)[..])
} else if let Some((pattern, size)) =
Inst::get_replicated_vector_pattern(const_data, ScalarSize::Size64)
{
Inst::load_replicated_vector_pattern(
rd,
pattern,
VectorSize::from_lane_size(size, true),
alloc_tmp,
)
} else {
smallvec![Inst::LoadFpuConst128 { rd, const_data }]
}
}
/// Determine whether a 128-bit constant represents a vector consisting of elements with
/// the same value.
pub fn get_replicated_vector_pattern(
value: u128,
size: ScalarSize,
) -> Option<(u64, ScalarSize)> {
let (mask, shift, next_size) = match size {
ScalarSize::Size8 => (u8::MAX as u128, 8, ScalarSize::Size128),
ScalarSize::Size16 => (u16::MAX as u128, 16, ScalarSize::Size8),
ScalarSize::Size32 => (u32::MAX as u128, 32, ScalarSize::Size16),
ScalarSize::Size64 => (u64::MAX as u128, 64, ScalarSize::Size32),
_ => return None,
};
let mut r = None;
let v = value & mask;
if (value >> shift) & mask == v {
r = Inst::get_replicated_vector_pattern(v, next_size);
if r.is_none() {
r = Some((v as u64, size));
}
}
r
}
/// Create instructions that load a vector constant consisting of elements with
/// the same value.
pub fn load_replicated_vector_pattern<F: FnMut(Type) -> Writable<Reg>>(
rd: Writable<Reg>,
pattern: u64,
size: VectorSize,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 5]> {
let lane_size = size.lane_size();
let widen_32_bit_pattern = |pattern, lane_size| {
if lane_size == ScalarSize::Size32 {
let pattern = pattern as u32 as u64;
ASIMDMovModImm::maybe_from_u64(pattern | (pattern << 32), ScalarSize::Size64)
} else {
None
}
};
if let Some(imm) = ASIMDMovModImm::maybe_from_u64(pattern, lane_size) {
smallvec![Inst::VecDupImm {
rd,
imm,
invert: false,
size
}]
} else if let Some(imm) = ASIMDMovModImm::maybe_from_u64(!pattern, lane_size) {
debug_assert_ne!(lane_size, ScalarSize::Size8);
debug_assert_ne!(lane_size, ScalarSize::Size64);
smallvec![Inst::VecDupImm {
rd,
imm,
invert: true,
size
}]
} else if let Some(imm) = widen_32_bit_pattern(pattern, lane_size) {
let mut insts = smallvec![];
// TODO: Implement support for 64-bit scalar MOVI; we zero-extend the
// lower 64 bits instead.
if !size.is_128bits() {
let tmp = alloc_tmp(types::I64X2);
insts.push(Inst::VecDupImm {
rd: tmp,
imm,
invert: false,
size: VectorSize::Size64x2,
});
insts.push(Inst::FpuExtend {
rd,
rn: tmp.to_reg(),
size: ScalarSize::Size64,
});
} else {
insts.push(Inst::VecDupImm {
rd,
imm,
invert: false,
size: VectorSize::Size64x2,
});
}
insts
} else if let Some(imm) = ASIMDFPModImm::maybe_from_u64(pattern, lane_size) {
smallvec![Inst::VecDupFPImm { rd, imm, size }]
} else {
let tmp = alloc_tmp(I64);
let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern, &mut alloc_tmp)[..]);
insts.push(Inst::VecDup {
rd,
rn: tmp.to_reg(),
size,
});
insts
}
}
/// Generic constructor for a load (zero-extending where appropriate).
pub fn gen_load(into_reg: Writable<Reg>, mem: AMode, ty: Type, flags: MemFlags) -> Inst {
match ty {
@@ -585,6 +375,7 @@ fn memarg_operands<F: Fn(VReg) -> VReg>(memarg: &AMode, collector: &mut OperandC
&AMode::RegOffset { rn, .. } => {
collector.reg_use(rn);
}
&AMode::Const { .. } => {}
}
}
@@ -928,9 +719,6 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
collector.reg_use(rt2);
pairmemarg_operands(mem, collector);
}
&Inst::LoadFpuConst64 { rd, .. } | &Inst::LoadFpuConst128 { rd, .. } => {
collector.reg_def(rd);
}
&Inst::FpuToInt { rd, rn, .. } => {
collector.reg_def(rd);
collector.reg_use(rn);
@@ -1318,7 +1106,7 @@ impl MachInst for Inst {
// Pretty-printing of instructions.
fn mem_finalize_for_show(mem: &AMode, state: &EmitState) -> (String, AMode) {
let (mem_insts, mem) = mem_finalize(0, mem, state);
let (mem_insts, mem) = mem_finalize(None, mem, state);
let mut mem_str = mem_insts
.into_iter()
.map(|inst| {
@@ -2007,18 +1795,6 @@ impl Inst {
format!("stp {}, {}, {}", rt, rt2, mem)
}
&Inst::LoadFpuConst64 { rd, const_data } => {
let rd = pretty_print_vreg_scalar(rd.to_reg(), ScalarSize::Size64, allocs);
format!(
"ldr {}, pc+8 ; b 12 ; data.f64 {}",
rd,
f64::from_bits(const_data)
)
}
&Inst::LoadFpuConst128 { rd, const_data } => {
let rd = pretty_print_vreg_scalar(rd.to_reg(), ScalarSize::Size128, allocs);
format!("ldr {}, pc+8 ; b 20 ; data.f128 0x{:032x}", rd, const_data)
}
&Inst::FpuToInt { op, rd, rn } => {
let (op, sizesrc, sizedest) = match op {
FpuToIntOp::F32ToI32 => ("fcvtzs", ScalarSize::Size32, OperandSize::Size32),
@@ -2820,7 +2596,7 @@ impl Inst {
// of the existing legalization framework).
let rd = allocs.next_writable(rd);
let mem = mem.with_allocs(allocs);
let (mem_insts, mem) = mem_finalize(0, &mem, state);
let (mem_insts, mem) = mem_finalize(None, &mem, state);
let mut ret = String::new();
for inst in mem_insts.into_iter() {
ret.push_str(

View File

@@ -26,7 +26,7 @@
;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (f32const (u64_from_ieee32 n)))
(rule (lower (f32const (u32_from_ieee32 n)))
(constant_f32 n))
;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -1954,7 +1954,7 @@
(rule -2 (lower (has_type ty (splat x @ (value_type (ty_scalar_float _)))))
(vec_dup_from_fpu x (vector_size ty) 0))
(rule (lower (has_type ty (splat (f32const (u64_from_ieee32 n)))))
(rule (lower (has_type ty (splat (f32const (u32_from_ieee32 n)))))
(splat_const n (vector_size ty)))
(rule (lower (has_type ty (splat (f64const (u64_from_ieee64 n)))))

View File

@@ -570,67 +570,6 @@ pub(crate) fn lower_constant_u64(ctx: &mut Lower<Inst>, rd: Writable<Reg>, value
}
}
pub(crate) fn lower_constant_f32(ctx: &mut Lower<Inst>, rd: Writable<Reg>, value: f32) {
let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();
for inst in Inst::load_fp_constant32(rd, value.to_bits(), alloc_tmp) {
ctx.emit(inst);
}
}
pub(crate) fn lower_constant_f64(ctx: &mut Lower<Inst>, rd: Writable<Reg>, value: f64) {
let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();
for inst in Inst::load_fp_constant64(rd, value.to_bits(), alloc_tmp) {
ctx.emit(inst);
}
}
pub(crate) fn lower_constant_f128(ctx: &mut Lower<Inst>, rd: Writable<Reg>, value: u128) {
if value == 0 {
// Fast-track a common case. The general case, viz, calling `Inst::load_fp_constant128`,
// is potentially expensive.
ctx.emit(Inst::VecDupImm {
rd,
imm: ASIMDMovModImm::zero(ScalarSize::Size8),
invert: false,
size: VectorSize::Size8x16,
});
} else {
let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();
for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
ctx.emit(inst);
}
}
}
pub(crate) fn lower_splat_const(
ctx: &mut Lower<Inst>,
rd: Writable<Reg>,
value: u64,
size: VectorSize,
) {
let (value, narrow_size) = match size.lane_size() {
ScalarSize::Size8 => (value as u8 as u64, ScalarSize::Size128),
ScalarSize::Size16 => (value as u16 as u64, ScalarSize::Size8),
ScalarSize::Size32 => (value as u32 as u64, ScalarSize::Size16),
ScalarSize::Size64 => (value, ScalarSize::Size32),
_ => unreachable!(),
};
let (value, size) = match Inst::get_replicated_vector_pattern(value as u128, narrow_size) {
Some((value, lane_size)) => (
value,
VectorSize::from_lane_size(lane_size, size.is_128bits()),
),
None => (value, size),
};
let alloc_tmp = |ty| ctx.alloc_tmp(ty).only_reg().unwrap();
for inst in Inst::load_replicated_vector_pattern(rd, value, size, alloc_tmp) {
ctx.emit(inst);
}
}
pub(crate) fn lower_condcode(cc: IntCC) -> Cond {
match cc {
IntCC::Equal => Cond::Eq,

View File

@@ -7,17 +7,16 @@ use smallvec::SmallVec;
// Types that the generated ISLE code uses via `use super::*`.
use super::{
fp_reg, lower_condcode, lower_constant_f128, lower_constant_f32, lower_constant_f64,
lower_fp_condcode, stack_reg, writable_link_reg, writable_zero_reg, zero_reg, AMode,
ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp,
FPUOpRI, FPUOpRIMod, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC, JTSequenceInfo,
MachLabel, MemLabel, MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize,
PairAMode, Reg, SImm9, ScalarSize, ShiftOpAndAmt, UImm12Scaled, UImm5, VecMisc2, VectorSize,
NZCV,
fp_reg, lower_condcode, lower_fp_condcode, stack_reg, writable_link_reg, writable_zero_reg,
zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, CallInfo, Cond,
CondBrKind, ExtendOp, FPUOpRI, FPUOpRIMod, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst,
IntCC, JTSequenceInfo, MachLabel, MemLabel, MoveWideConst, MoveWideOp, NarrowValueMode, Opcode,
OperandSize, PairAMode, Reg, SImm9, ScalarSize, ShiftOpAndAmt, UImm12Scaled, UImm5, VecMisc2,
VectorSize, NZCV,
};
use crate::ir::condcodes;
use crate::isa::aarch64::inst::{FPULeftShiftImm, FPURightShiftImm};
use crate::isa::aarch64::lower::{lower_address, lower_pair_address, lower_splat_const};
use crate::isa::aarch64::lower::{lower_address, lower_pair_address};
use crate::isa::aarch64::AArch64Backend;
use crate::machinst::valueregs;
use crate::machinst::{isle::*, InputSourceInst};
@@ -524,38 +523,6 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
lower_pair_address(self.lower_ctx, addr, offset as i32)
}
fn constant_f32(&mut self, value: u64) -> Reg {
let rd = self.temp_writable_reg(I8X16);
lower_constant_f32(self.lower_ctx, rd, f32::from_bits(value as u32));
rd.to_reg()
}
fn constant_f64(&mut self, value: u64) -> Reg {
let rd = self.temp_writable_reg(I8X16);
lower_constant_f64(self.lower_ctx, rd, f64::from_bits(value));
rd.to_reg()
}
fn constant_f128(&mut self, value: u128) -> Reg {
let rd = self.temp_writable_reg(I8X16);
lower_constant_f128(self.lower_ctx, rd, value);
rd.to_reg()
}
fn splat_const(&mut self, value: u64, size: &VectorSize) -> Reg {
let rd = self.temp_writable_reg(I8X16);
lower_splat_const(self.lower_ctx, rd, value, *size);
rd.to_reg()
}
fn fp_cond_code(&mut self, cc: &condcodes::FloatCC) -> Cond {
lower_fp_condcode(*cc)
}
@@ -612,8 +579,6 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
}
fn min_fp_value(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
if in_bits == 32 {
// From float32.
let min = match (signed, out_bits) {
@@ -630,7 +595,7 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
),
};
lower_constant_f32(self.lower_ctx, tmp, min);
generated_code::constructor_constant_f32(self, min.to_bits())
} else if in_bits == 64 {
// From float64.
let min = match (signed, out_bits) {
@@ -647,7 +612,7 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
),
};
lower_constant_f64(self.lower_ctx, tmp, min);
generated_code::constructor_constant_f64(self, min.to_bits())
} else {
unimplemented!(
"unexpected input size for min_fp_value: {} (signed: {}, output size: {})",
@@ -656,13 +621,9 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
out_bits
);
}
tmp.to_reg()
}
fn max_fp_value(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
if in_bits == 32 {
// From float32.
let max = match (signed, out_bits) {
@@ -682,7 +643,7 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
),
};
lower_constant_f32(self.lower_ctx, tmp, max);
generated_code::constructor_constant_f32(self, max.to_bits())
} else if in_bits == 64 {
// From float64.
let max = match (signed, out_bits) {
@@ -702,7 +663,7 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
),
};
lower_constant_f64(self.lower_ctx, tmp, max);
generated_code::constructor_constant_f64(self, max.to_bits())
} else {
unimplemented!(
"unexpected input size for max_fp_value: {} (signed: {}, output size: {})",
@@ -711,8 +672,6 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
out_bits
);
}
tmp.to_reg()
}
fn fpu_op_ri_ushr(&mut self, ty_bits: u8, shift: u8) -> FPUOpRI {
@@ -785,4 +744,66 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
None
}
}
fn asimd_mov_mod_imm_zero(&mut self, size: &ScalarSize) -> ASIMDMovModImm {
ASIMDMovModImm::zero(*size)
}
fn asimd_mov_mod_imm_from_u64(
&mut self,
val: u64,
size: &ScalarSize,
) -> Option<ASIMDMovModImm> {
ASIMDMovModImm::maybe_from_u64(val, *size)
}
fn asimd_fp_mod_imm_from_u64(&mut self, val: u64, size: &ScalarSize) -> Option<ASIMDFPModImm> {
ASIMDFPModImm::maybe_from_u64(val, *size)
}
fn u64_low32_bits_unset(&mut self, val: u64) -> Option<u64> {
if val & 0xffffffff == 0 {
Some(val)
} else {
None
}
}
fn u128_replicated_u64(&mut self, val: u128) -> Option<u64> {
let low64 = val as u64 as u128;
if (low64 | (low64 << 64)) == val {
Some(low64 as u64)
} else {
None
}
}
fn u64_replicated_u32(&mut self, val: u64) -> Option<u64> {
let low32 = val as u32 as u64;
if (low32 | (low32 << 32)) == val {
Some(low32)
} else {
None
}
}
fn u32_replicated_u16(&mut self, val: u64) -> Option<u64> {
let val = val as u32;
let low16 = val as u16 as u32;
if (low16 | (low16 << 16)) == val {
Some(low16.into())
} else {
None
}
}
fn u16_replicated_u8(&mut self, val: u64) -> Option<u64> {
let val = val as u16;
let low8 = val as u8 as u16;
if (low8 | (low8 << 8)) == val {
Some(low8.into())
} else {
None
}
}
}
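
As a worked example of the replication checks above (plain Rust, not
compiler code: `replicated_u64` and `replicated_u32` are standalone
stand-ins for the `u128_replicated_u64` and `u64_replicated_u32`
helpers, and the 1.0-splat constant is an arbitrary example), a v128
constant whose four f32 lanes are all 1.0 narrows from a u128 down to a
single 32-bit lane before any instructions are chosen.

fn replicated_u64(val: u128) -> Option<u64> {
    let low = val as u64;
    if ((low as u128) | ((low as u128) << 64)) == val {
        Some(low)
    } else {
        None
    }
}

fn replicated_u32(val: u64) -> Option<u32> {
    let low = val as u32;
    if ((low as u64) | ((low as u64) << 32)) == val {
        Some(low)
    } else {
        None
    }
}

fn main() {
    // A v128 constant whose four f32 lanes are all 1.0 (0x3f800000).
    let c: u128 = 0x3f800000_3f800000_3f800000_3f800000;
    let half = replicated_u64(c).expect("high and low halves match");
    let lane = replicated_u32(half).expect("both 32-bit halves match");
    // The splat therefore narrows to a single 32-bit lane, which the
    // FP-immediate splat rule can encode as FMOV (vector, immediate) 1.0.
    assert_eq!(lane, 1.0f32.to_bits());
}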

View File

@@ -11,7 +11,7 @@
;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (f32const (u64_from_ieee32 n)))
(rule (lower (f32const (u32_from_ieee32 n)))
(imm $F32 n))
;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -896,7 +896,7 @@
(CallInd
(link WritableReg)
(info BoxCallIndInfo))
;; A pseudo-instruction that captures register arguments in vregs.
(Args
(args VecArgPair))
@@ -1555,8 +1555,8 @@
(decl u8_as_u16 (u8) u16)
(extern constructor u8_as_u16 u8_as_u16)
(decl u64_as_u32 (u64) u32)
(extern constructor u64_as_u32 u64_as_u32)
(decl u64_truncate_to_u32 (u64) u32)
(extern constructor u64_truncate_to_u32 u64_truncate_to_u32)
(decl u64_as_i16 (u64) i16)
(extern constructor u64_as_i16 u64_as_i16)
@@ -3000,7 +3000,7 @@
;; 32-bit result type, any value
(rule 5 (imm (gpr32_ty ty) n)
(let ((dst WritableReg (temp_writable_reg ty))
(_ Unit (emit (MInst.Mov32Imm dst (u64_as_u32 n)))))
(_ Unit (emit (MInst.Mov32Imm dst (u64_truncate_to_u32 n)))))
dst))
;; 64-bit result type, value fits in i16
@@ -3051,7 +3051,7 @@
;; TODO: use LZER to load 0.0
(rule 8 (imm $F32 n)
(let ((dst WritableReg (temp_writable_reg $F32))
(_ Unit (emit (MInst.LoadFpuConst32 dst (u64_as_u32 n)))))
(_ Unit (emit (MInst.LoadFpuConst32 dst (u64_truncate_to_u32 n)))))
dst))
;; 64-bit floating-point type, any value. Loaded from literal pool.

View File

@@ -18,7 +18,7 @@
;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (f32const (u64_from_ieee32 x)))
(rule (lower (f32const (u32_from_ieee32 x)))
(imm $F32 x))

View File

@@ -436,7 +436,7 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, S390xBackend> {
}
#[inline]
fn u64_as_u32(&mut self, n: u64) -> u32 {
fn u64_truncate_to_u32(&mut self, n: u64) -> u32 {
n as u32
}

View File

@@ -24,7 +24,7 @@
;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(rule (lower (f32const (u64_from_ieee32 x)))
(rule (lower (f32const (u32_from_ieee32 x)))
(imm $F32 x))
;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

View File

@@ -80,6 +80,11 @@ macro_rules! isle_common_prelude_methods {
x ^ y
}
#[inline]
fn u64_shl(&mut self, x: u64, y: u64) -> u64 {
x << y
}
#[inline]
fn imm64_shl(&mut self, ty: Type, x: Imm64, y: Imm64) -> Imm64 {
// Mask off any excess shift bits.
@@ -502,8 +507,8 @@ macro_rules! isle_common_prelude_methods {
}
}
fn u64_from_ieee32(&mut self, val: Ieee32) -> u64 {
val.bits().into()
fn u32_from_ieee32(&mut self, val: Ieee32) -> u32 {
val.bits()
}
fn u64_from_ieee64(&mut self, val: Ieee64) -> u64 {
@@ -748,5 +753,13 @@ macro_rules! isle_common_prelude_methods {
fn pack_block_array_2(&mut self, a: BlockCall, b: BlockCall) -> BlockArray2 {
[a, b]
}
fn u128_as_u64(&mut self, val: u128) -> Option<u64> {
u64::try_from(val).ok()
}
fn u64_as_u32(&mut self, val: u64) -> Option<u32> {
u32::try_from(val).ok()
}
};
}

View File

@@ -88,10 +88,17 @@
(decl pure u32_as_u64 (u32) u64)
(extern constructor u32_as_u64 u32_as_u64)
(convert u32 u64 u32_as_u64)
(decl pure i64_as_u64 (i64) u64)
(extern constructor i64_as_u64 i64_as_u64)
(decl u128_as_u64 (u64) u128)
(extern extractor u128_as_u64 u128_as_u64)
(decl u64_as_u32 (u32) u64)
(extern extractor u64_as_u32 u64_as_u32)
;;;; Primitive Arithmetic ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(decl pure u8_and (u8 u8) u8)
@@ -129,6 +136,9 @@
(decl pure u64_xor (u64 u64) u64)
(extern constructor u64_xor u64_xor)
(decl pure u64_shl (u64 u64) u64)
(extern constructor u64_shl u64_shl)
(decl pure imm64_shl (Type Imm64 Imm64) Imm64)
(extern constructor imm64_shl imm64_shl)
@@ -388,8 +398,8 @@
(extern constructor imm64_masked imm64_masked)
;; Extract a `u64` from an `Ieee32`.
(decl u64_from_ieee32 (u64) Ieee32)
(extern extractor infallible u64_from_ieee32 u64_from_ieee32)
(decl u32_from_ieee32 (u32) Ieee32)
(extern extractor infallible u32_from_ieee32 u32_from_ieee32)
;; Extract a `u64` from an `Ieee64`.
(decl u64_from_ieee64 (u64) Ieee64)