Merge pull request #3679 from FreddieLiardet/fp_const_fmov

Improve code generation for floating-point constants
2022-01-19 09:59:34 -08:00
parent 2649d2352c b5531580e7
commit 4a331b8981
10 changed files with 490 additions and 309 deletions
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -454,6 +454,12 @@
      (rn Reg)
      (size ScalarSize))
    ;; Loads a floating-point immediate.
    (FpuMoveFPImm
      (rd WritableReg)
      (imm ASIMDFPModImm)
      (size ScalarSize))
    ;; Move to a vector element from a GPR.
    (MovToVec
      (rd WritableReg)
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -1983,6 +1983,19 @@ impl MachInstEmit for Inst {
                };
                sink.put4(template | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()));
            }
            &Inst::FpuMoveFPImm { rd, imm, size } => {
                let size_code = match size {
                    ScalarSize::Size32 => 0b00,
                    ScalarSize::Size64 => 0b01,
                    _ => unimplemented!(),
                };
                sink.put4(
                    0b000_11110_00_1_00_000_000100_00000_00000
                        | size_code << 22
                        | ((imm.enc_bits() as u32) << 13)
                        | machreg_to_vec(rd.to_reg()),
                );
            }
            &Inst::MovToVec { rd, rn, idx, size } => {
                let (imm5, shift) = match size.lane_size() {
                    ScalarSize::Size8 => (0b00001, 1),
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -2051,6 +2051,25 @@ fn test_aarch64_binemit() {
        "8103271E",
        "fmov s1, w28",
    ));
    insns.push((
        Inst::FpuMoveFPImm {
            rd: writable_vreg(31),
            imm: ASIMDFPModImm::maybe_from_u64(f64::to_bits(1.0), ScalarSize::Size64).unwrap(),
            size: ScalarSize::Size64,
        },
        "1F106E1E",
        "fmov d31, #1",
    ));
    insns.push((
        Inst::FpuMoveFPImm {
            rd: writable_vreg(1),
            imm: ASIMDFPModImm::maybe_from_u64(f32::to_bits(31.0).into(), ScalarSize::Size32)
                .unwrap(),
            size: ScalarSize::Size32,
        },
        "01F0271E",
        "fmov s1, #31",
    ));
    insns.push((
        Inst::MovToVec {
            rd: writable_vreg(0),
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -239,29 +239,35 @@ impl Inst {
    /// Create instructions that load a 32-bit floating-point constant.
    pub fn load_fp_constant32<F: FnMut(Type) -> Writable<Reg>>(
        rd: Writable<Reg>,
-        value: u32,
+        const_data: u32,
        mut alloc_tmp: F,
    ) -> SmallVec<[Inst; 4]> {
        // Note that we must make sure that all bits outside the lowest 32 are set to 0
        // because this function is also used to load wider constants (that have zeros
        // in their most significant bits).
-        if value == 0 {
+        if const_data == 0 {
            smallvec![Inst::VecDupImm {
                rd,
                imm: ASIMDMovModImm::zero(ScalarSize::Size32),
                invert: false,
-                size: VectorSize::Size32x2
+                size: VectorSize::Size32x2,
            }]
        } else if let Some(imm) =
            ASIMDFPModImm::maybe_from_u64(const_data.into(), ScalarSize::Size32)
        {
            smallvec![Inst::FpuMoveFPImm {
                rd,
                imm,
                size: ScalarSize::Size32,
            }]
        } else {
-            // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent
-            // bits.
            let tmp = alloc_tmp(I32);
-            let mut insts = Inst::load_constant(tmp, value as u64);
+            let mut insts = Inst::load_constant(tmp, const_data as u64);
            insts.push(Inst::MovToFpu {
                rd,
                rn: tmp.to_reg(),
-                size: ScalarSize::Size64,
+                size: ScalarSize::Size32,
            });
            insts
@@ -277,11 +283,23 @@ impl Inst {
        // Note that we must make sure that all bits outside the lowest 64 are set to 0
        // because this function is also used to load wider constants (that have zeros
        // in their most significant bits).
-        if let Ok(const_data) = u32::try_from(const_data) {
+        // TODO: Treat as half of a 128 bit vector and consider replicated patterns.
        // Scalar MOVI might also be an option.
        if const_data == 0 {
            smallvec![Inst::VecDupImm {
                rd,
                imm: ASIMDMovModImm::zero(ScalarSize::Size32),
                invert: false,
                size: VectorSize::Size32x2,
            }]
        } else if let Some(imm) = ASIMDFPModImm::maybe_from_u64(const_data, ScalarSize::Size64) {
            smallvec![Inst::FpuMoveFPImm {
                rd,
                imm,
                size: ScalarSize::Size64,
            }]
        } else if let Ok(const_data) = u32::try_from(const_data) {
            Inst::load_fp_constant32(rd, const_data, alloc_tmp)
-        // TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent
-        // bits.  Also, treat it as half of a 128-bit vector and consider replicated
-        // patterns. Scalar MOVI might also be an option.
        } else if const_data & (u32::MAX as u64) == 0 {
            let tmp = alloc_tmp(I64);
            let mut insts = Inst::load_constant(tmp, const_data);
@@ -879,6 +897,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
            collector.add_def(rd);
            collector.add_use(rn);
        }
        &Inst::FpuMoveFPImm { rd, .. } => {
            collector.add_def(rd);
        }
        &Inst::MovToVec { rd, rn, .. } => {
            collector.add_mod(rd);
            collector.add_use(rn);
@@ -1654,6 +1675,9 @@ pub fn aarch64_map_regs<RM: RegMapper>(inst: &mut Inst, mapper: &RM) {
            mapper.map_def(rd);
            mapper.map_use(rn);
        }
        &mut Inst::FpuMoveFPImm { ref mut rd, .. } => {
            mapper.map_def(rd);
        }
        &mut Inst::MovToVec {
            ref mut rd,
            ref mut rn,
@@ -2693,6 +2717,12 @@ impl Inst {
                let rn = show_ireg_sized(rn, mb_rru, operand_size);
                format!("fmov {}, {}", rd, rn)
            }
            &Inst::FpuMoveFPImm { rd, imm, size } => {
                let imm = imm.show_rru(mb_rru);
                let rd = show_vreg_scalar(rd.to_reg(), mb_rru, size);
                format!("fmov {}, {}", rd, imm)
            }
            &Inst::MovToVec { rd, rn, idx, size } => {
                let rd = show_vreg_element(rd.to_reg(), mb_rru, idx, size);
                let rn = show_ireg_sized(rn, mb_rru, size.operand_size());
--- a/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.manifest
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.manifest
@@ -1,4 +1,4 @@
 src/clif.isle f176ef3bba99365
 src/prelude.isle 22dd5ff133398960
-src/isa/aarch64/inst.isle 5fa80451697b084f
+src/isa/aarch64/inst.isle f946561093de4ff5
 src/isa/aarch64/lower.isle 2d2e1e076a0c8a23
--- a/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle/generated_code.rs
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -53,16 +53,11 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
    match op {
        Opcode::Iconst | Opcode::Bconst | Opcode::Null => implemented_in_isle(ctx),
-        Opcode::F32const => {
+        Opcode::F32const | Opcode::F64const => unreachable!(
-            let value = f32::from_bits(ctx.get_constant(insn).unwrap() as u32);
+            "Should never see constant ops at top level lowering entry
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+            point, as constants are rematerialized at use-sites"
-            lower_constant_f32(ctx, rd, value);
+        ),
-        }
+
-        Opcode::F64const => {
-            let value = f64::from_bits(ctx.get_constant(insn).unwrap());
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            lower_constant_f64(ctx, rd, value);
-        }
        Opcode::Iadd => implemented_in_isle(ctx),
        Opcode::Isub => implemented_in_isle(ctx),
        Opcode::UaddSat | Opcode::SaddSat | Opcode::UsubSat | Opcode::SsubSat => {
--- a/cranelift/filetests/filetests/isa/aarch64/constants.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/constants.clif
@@ -292,3 +292,124 @@ block0:
 ;   Inst 1:   ret
 ; }}
 function %f() -> f64 {
 block0:
  v0 = f64const 0x1.0
  return v0
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fmov d0, #1
 ;   Inst 1:   ret
 ; }}
 function %f() -> f32 {
 block0:
  v0 = f32const 0x5.0
  return v0
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fmov s0, #5
 ;   Inst 1:   ret
 ; }}
 function %f() -> f64 {
 block0:
  v0 = f64const 0x32.0
  return v0
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 3)
 ;   Inst 0:   movz x0, #16457, LSL #48
 ;   Inst 1:   fmov d0, x0
 ;   Inst 2:   ret
 ; }}
 function %f() -> f32 {
 block0:
  v0 = f32const 0x32.0
  return v0
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 3)
 ;   Inst 0:   movz x0, #16968, LSL #16
 ;   Inst 1:   fmov s0, w0
 ;   Inst 2:   ret
 ; }}
 function %f() -> f64 {
 block0:
  v0 = f64const 0x0.0
  return v0
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   movi v0.2s, #0
 ;   Inst 1:   ret
 ; }}
 function %f() -> f32 {
 block0:
  v0 = f32const 0x0.0
  return v0
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   movi v0.2s, #0
 ;   Inst 1:   ret
 ; }}
 function %f() -> f64 {
 block0:
  v0 = f64const -0x10.0
  return v0
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fmov d0, #-16
 ;   Inst 1:   ret
 ; }}
 function %f() -> f32 {
 block0:
  v0 = f32const -0x10.0
  return v0
 }
 ; VCode_ShowWithRRU {{
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 2)
 ;   Inst 0:   fmov s0, #-16
 ;   Inst 1:   ret
 ; }}
--- a/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif
@@ -76,19 +76,18 @@ block0(v0: f32):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 12)
+;   (instruction range: 0 .. 11)
 ;   Inst 0:   fcmp s0, s0
 ;   Inst 1:   b.vc 8 ; udf
-;   Inst 2:   movz x0, #49024, LSL #16
+;   Inst 2:   fmov s1, #-1
-;   Inst 3:   fmov d1, x0
+;   Inst 3:   fcmp s0, s1
-;   Inst 4:   fcmp s0, s1
+;   Inst 4:   b.gt 8 ; udf
-;   Inst 5:   b.gt 8 ; udf
+;   Inst 5:   movz x0, #17280, LSL #16
-;   Inst 6:   movz x0, #17280, LSL #16
+;   Inst 6:   fmov s1, w0
-;   Inst 7:   fmov d1, x0
+;   Inst 7:   fcmp s0, s1
-;   Inst 8:   fcmp s0, s1
+;   Inst 8:   b.mi 8 ; udf
-;   Inst 9:   b.mi 8 ; udf
+;   Inst 9:   fcvtzu w0, s0
-;   Inst 10:   fcvtzu w0, s0
+;   Inst 10:   ret
-;   Inst 11:   ret
 ; }}
 function u0:0(f64) -> i8 {
@@ -101,19 +100,18 @@ block0(v0: f64):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 12)
+;   (instruction range: 0 .. 11)
 ;   Inst 0:   fcmp d0, d0
 ;   Inst 1:   b.vc 8 ; udf
-;   Inst 2:   movz x0, #49136, LSL #48
+;   Inst 2:   fmov d1, #-1
-;   Inst 3:   fmov d1, x0
+;   Inst 3:   fcmp d0, d1
-;   Inst 4:   fcmp d0, d1
+;   Inst 4:   b.gt 8 ; udf
-;   Inst 5:   b.gt 8 ; udf
+;   Inst 5:   movz x0, #16496, LSL #48
-;   Inst 6:   movz x0, #16496, LSL #48
+;   Inst 6:   fmov d1, x0
-;   Inst 7:   fmov d1, x0
+;   Inst 7:   fcmp d0, d1
-;   Inst 8:   fcmp d0, d1
+;   Inst 8:   b.mi 8 ; udf
-;   Inst 9:   b.mi 8 ; udf
+;   Inst 9:   fcvtzu w0, d0
-;   Inst 10:   fcvtzu w0, d0
+;   Inst 10:   ret
-;   Inst 11:   ret
 ; }}
 function u0:0(f32) -> i16 {
@@ -126,19 +124,18 @@ block0(v0: f32):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 12)
+;   (instruction range: 0 .. 11)
 ;   Inst 0:   fcmp s0, s0
 ;   Inst 1:   b.vc 8 ; udf
-;   Inst 2:   movz x0, #49024, LSL #16
+;   Inst 2:   fmov s1, #-1
-;   Inst 3:   fmov d1, x0
+;   Inst 3:   fcmp s0, s1
-;   Inst 4:   fcmp s0, s1
+;   Inst 4:   b.gt 8 ; udf
-;   Inst 5:   b.gt 8 ; udf
+;   Inst 5:   movz x0, #18304, LSL #16
-;   Inst 6:   movz x0, #18304, LSL #16
+;   Inst 6:   fmov s1, w0
-;   Inst 7:   fmov d1, x0
+;   Inst 7:   fcmp s0, s1
-;   Inst 8:   fcmp s0, s1
+;   Inst 8:   b.mi 8 ; udf
-;   Inst 9:   b.mi 8 ; udf
+;   Inst 9:   fcvtzu w0, s0
-;   Inst 10:   fcvtzu w0, s0
+;   Inst 10:   ret
-;   Inst 11:   ret
 ; }}
 function u0:0(f64) -> i16 {
@@ -151,18 +148,17 @@ block0(v0: f64):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 12)
+;   (instruction range: 0 .. 11)
 ;   Inst 0:   fcmp d0, d0
 ;   Inst 1:   b.vc 8 ; udf
-;   Inst 2:   movz x0, #49136, LSL #48
+;   Inst 2:   fmov d1, #-1
-;   Inst 3:   fmov d1, x0
+;   Inst 3:   fcmp d0, d1
-;   Inst 4:   fcmp d0, d1
+;   Inst 4:   b.gt 8 ; udf
-;   Inst 5:   b.gt 8 ; udf
+;   Inst 5:   movz x0, #16624, LSL #48
-;   Inst 6:   movz x0, #16624, LSL #48
+;   Inst 6:   fmov d1, x0
-;   Inst 7:   fmov d1, x0
+;   Inst 7:   fcmp d0, d1
-;   Inst 8:   fcmp d0, d1
+;   Inst 8:   b.mi 8 ; udf
-;   Inst 9:   b.mi 8 ; udf
+;   Inst 9:   fcvtzu w0, d0
-;   Inst 10:   fcvtzu w0, d0
+;   Inst 10:   ret
-;   Inst 11:   ret
 ; }}
--- a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif
@@ -494,19 +494,18 @@ block0(v0: f32):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 12)
+;   (instruction range: 0 .. 11)
 ;   Inst 0:   fcmp s0, s0
 ;   Inst 1:   b.vc 8 ; udf
-;   Inst 2:   movz x0, #49024, LSL #16
+;   Inst 2:   fmov s1, #-1
-;   Inst 3:   fmov d1, x0
+;   Inst 3:   fcmp s0, s1
-;   Inst 4:   fcmp s0, s1
+;   Inst 4:   b.gt 8 ; udf
-;   Inst 5:   b.gt 8 ; udf
+;   Inst 5:   movz x0, #20352, LSL #16
-;   Inst 6:   movz x0, #20352, LSL #16
+;   Inst 6:   fmov s1, w0
-;   Inst 7:   fmov d1, x0
+;   Inst 7:   fcmp s0, s1
-;   Inst 8:   fcmp s0, s1
+;   Inst 8:   b.mi 8 ; udf
-;   Inst 9:   b.mi 8 ; udf
+;   Inst 9:   fcvtzu w0, s0
-;   Inst 10:   fcvtzu w0, s0
+;   Inst 10:   ret
-;   Inst 11:   ret
 ; }}
 function %f34(f32) -> i32 {
@@ -523,11 +522,11 @@ block0(v0: f32):
 ;   Inst 0:   fcmp s0, s0
 ;   Inst 1:   b.vc 8 ; udf
 ;   Inst 2:   movz x0, #52992, LSL #16
-;   Inst 3:   fmov d1, x0
+;   Inst 3:   fmov s1, w0
 ;   Inst 4:   fcmp s0, s1
 ;   Inst 5:   b.ge 8 ; udf
 ;   Inst 6:   movz x0, #20224, LSL #16
-;   Inst 7:   fmov d1, x0
+;   Inst 7:   fmov s1, w0
 ;   Inst 8:   fcmp s0, s1
 ;   Inst 9:   b.mi 8 ; udf
 ;   Inst 10:   fcvtzs w0, s0
@@ -544,19 +543,18 @@ block0(v0: f32):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 12)
+;   (instruction range: 0 .. 11)
 ;   Inst 0:   fcmp s0, s0
 ;   Inst 1:   b.vc 8 ; udf
-;   Inst 2:   movz x0, #49024, LSL #16
+;   Inst 2:   fmov s1, #-1
-;   Inst 3:   fmov d1, x0
+;   Inst 3:   fcmp s0, s1
-;   Inst 4:   fcmp s0, s1
+;   Inst 4:   b.gt 8 ; udf
-;   Inst 5:   b.gt 8 ; udf
+;   Inst 5:   movz x0, #24448, LSL #16
-;   Inst 6:   movz x0, #24448, LSL #16
+;   Inst 6:   fmov s1, w0
-;   Inst 7:   fmov d1, x0
+;   Inst 7:   fcmp s0, s1
-;   Inst 8:   fcmp s0, s1
+;   Inst 8:   b.mi 8 ; udf
-;   Inst 9:   b.mi 8 ; udf
+;   Inst 9:   fcvtzu x0, s0
-;   Inst 10:   fcvtzu x0, s0
+;   Inst 10:   ret
-;   Inst 11:   ret
 ; }}
 function %f36(f32) -> i64 {
@@ -573,11 +571,11 @@ block0(v0: f32):
 ;   Inst 0:   fcmp s0, s0
 ;   Inst 1:   b.vc 8 ; udf
 ;   Inst 2:   movz x0, #57088, LSL #16
-;   Inst 3:   fmov d1, x0
+;   Inst 3:   fmov s1, w0
 ;   Inst 4:   fcmp s0, s1
 ;   Inst 5:   b.ge 8 ; udf
 ;   Inst 6:   movz x0, #24320, LSL #16
-;   Inst 7:   fmov d1, x0
+;   Inst 7:   fmov s1, w0
 ;   Inst 8:   fcmp s0, s1
 ;   Inst 9:   b.mi 8 ; udf
 ;   Inst 10:   fcvtzs x0, s0
@@ -594,19 +592,18 @@ block0(v0: f64):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 12)
+;   (instruction range: 0 .. 11)
 ;   Inst 0:   fcmp d0, d0
 ;   Inst 1:   b.vc 8 ; udf
-;   Inst 2:   movz x0, #49136, LSL #48
+;   Inst 2:   fmov d1, #-1
-;   Inst 3:   fmov d1, x0
+;   Inst 3:   fcmp d0, d1
-;   Inst 4:   fcmp d0, d1
+;   Inst 4:   b.gt 8 ; udf
-;   Inst 5:   b.gt 8 ; udf
+;   Inst 5:   movz x0, #16880, LSL #48
-;   Inst 6:   movz x0, #16880, LSL #48
+;   Inst 6:   fmov d1, x0
-;   Inst 7:   fmov d1, x0
+;   Inst 7:   fcmp d0, d1
-;   Inst 8:   fcmp d0, d1
+;   Inst 8:   b.mi 8 ; udf
-;   Inst 9:   b.mi 8 ; udf
+;   Inst 9:   fcvtzu w0, d0
-;   Inst 10:   fcvtzu w0, d0
+;   Inst 10:   ret
-;   Inst 11:   ret
 ; }}
 function %f38(f64) -> i32 {
@@ -643,19 +640,18 @@ block0(v0: f64):
 ;   Entry block: 0
 ; Block 0:
 ;   (original IR block: block0)
-;   (instruction range: 0 .. 12)
+;   (instruction range: 0 .. 11)
 ;   Inst 0:   fcmp d0, d0
 ;   Inst 1:   b.vc 8 ; udf
-;   Inst 2:   movz x0, #49136, LSL #48
+;   Inst 2:   fmov d1, #-1
-;   Inst 3:   fmov d1, x0
+;   Inst 3:   fcmp d0, d1
-;   Inst 4:   fcmp d0, d1
+;   Inst 4:   b.gt 8 ; udf
-;   Inst 5:   b.gt 8 ; udf
+;   Inst 5:   movz x0, #17392, LSL #48
-;   Inst 6:   movz x0, #17392, LSL #48
+;   Inst 6:   fmov d1, x0
-;   Inst 7:   fmov d1, x0
+;   Inst 7:   fcmp d0, d1
-;   Inst 8:   fcmp d0, d1
+;   Inst 8:   b.mi 8 ; udf
-;   Inst 9:   b.mi 8 ; udf
+;   Inst 9:   fcvtzu x0, d0
-;   Inst 10:   fcvtzu x0, d0
+;   Inst 10:   ret
-;   Inst 11:   ret
 ; }}
 function %f40(f64) -> i64 {
@@ -815,7 +811,7 @@ block0(v0: f32):
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 9)
 ;   Inst 0:   movz x0, #20352, LSL #16
-;   Inst 1:   fmov d1, x0
+;   Inst 1:   fmov s1, w0
 ;   Inst 2:   fmin s2, s0, s1
 ;   Inst 3:   movi v1.2s, #0
 ;   Inst 4:   fmax s2, s2, s1
@@ -837,10 +833,10 @@ block0(v0: f32):
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 11)
 ;   Inst 0:   movz x0, #20224, LSL #16
-;   Inst 1:   fmov d1, x0
+;   Inst 1:   fmov s1, w0
 ;   Inst 2:   fmin s1, s0, s1
 ;   Inst 3:   movz x0, #52992, LSL #16
-;   Inst 4:   fmov d2, x0
+;   Inst 4:   fmov s2, w0
 ;   Inst 5:   fmax s1, s1, s2
 ;   Inst 6:   movi v2.2s, #0
 ;   Inst 7:   fcmp s0, s0
@@ -861,7 +857,7 @@ block0(v0: f32):
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 9)
 ;   Inst 0:   movz x0, #24448, LSL #16
-;   Inst 1:   fmov d1, x0
+;   Inst 1:   fmov s1, w0
 ;   Inst 2:   fmin s2, s0, s1
 ;   Inst 3:   movi v1.2s, #0
 ;   Inst 4:   fmax s2, s2, s1
@@ -883,10 +879,10 @@ block0(v0: f32):
 ;   (original IR block: block0)
 ;   (instruction range: 0 .. 11)
 ;   Inst 0:   movz x0, #24320, LSL #16
-;   Inst 1:   fmov d1, x0
+;   Inst 1:   fmov s1, w0
 ;   Inst 2:   fmin s1, s0, s1
 ;   Inst 3:   movz x0, #57088, LSL #16
-;   Inst 4:   fmov d2, x0
+;   Inst 4:   fmov s2, w0
 ;   Inst 5:   fmax s1, s1, s2
 ;   Inst 6:   movi v2.2s, #0
 ;   Inst 7:   fcmp s0, s0