Merge pull request #2310 from akirilov-arm/vector_constants

Cranelift AArch64: Improve code generation for vector constants
Chris Fallin
2020-11-01 21:56:40 -08:00
committed by GitHub
12 changed files with 549 additions and 164 deletions

View File

@@ -1,6 +1,7 @@
//! Instruction predicates/properties, shared by various analyses.
use crate::ir::{DataFlowGraph, Function, Inst, InstructionData, Opcode};
use crate::machinst::ty_bits;
use cranelift_entity::EntityRef;
/// Preserve instructions with used result values.
@@ -59,7 +60,21 @@ pub fn is_constant_64bit(func: &Function, inst: Inst) -> Option<u64> {
&InstructionData::UnaryImm { imm, .. } => Some(imm.bits() as u64),
&InstructionData::UnaryIeee32 { imm, .. } => Some(imm.bits() as u64),
&InstructionData::UnaryIeee64 { imm, .. } => Some(imm.bits()),
&InstructionData::UnaryBool { imm, .. } => Some(if imm { 1 } else { 0 }),
&InstructionData::UnaryBool { imm, .. } => {
let imm = if imm {
let bits = ty_bits(func.dfg.value_type(func.dfg.inst_results(inst)[0]));
if bits < 64 {
(1u64 << bits) - 1
} else {
u64::MAX
}
} else {
0
};
Some(imm)
}
_ => None,
}
}
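
The branch above widens a `true` boolean to an all-ones value of the type's width, which the vector-constant pattern matching later in this PR relies on. A minimal standalone sketch of just that computation (the function name is assumed, not from the source):

fn bool_constant_bits(imm: bool, bits: u64) -> u64 {
    if !imm {
        0
    } else if bits < 64 {
        (1u64 << bits) - 1 // e.g. b8 -> 0xFF, b16 -> 0xFFFF
    } else {
        u64::MAX // b64: a shift by 64 would overflow, hence the special case
    }
}

// bool_constant_bits(true, 8) == 255, matching the `movz x0, #255` filetest below.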

View File

@@ -609,10 +609,27 @@ pub enum VectorSize {
}
impl VectorSize {
/// Get the vector operand size with the given scalar size as lane size.
pub fn from_lane_size(size: ScalarSize, is_128bit: bool) -> VectorSize {
match (size, is_128bit) {
(ScalarSize::Size8, false) => VectorSize::Size8x8,
(ScalarSize::Size8, true) => VectorSize::Size8x16,
(ScalarSize::Size16, false) => VectorSize::Size16x4,
(ScalarSize::Size16, true) => VectorSize::Size16x8,
(ScalarSize::Size32, false) => VectorSize::Size32x2,
(ScalarSize::Size32, true) => VectorSize::Size32x4,
(ScalarSize::Size64, true) => VectorSize::Size64x2,
_ => panic!("Unexpected scalar operand size: {:?}", size),
}
}
/// Convert from a type into a vector operand size.
pub fn from_ty(ty: Type) -> VectorSize {
match ty {
B8X16 => VectorSize::Size8x16,
B16X8 => VectorSize::Size16x8,
B32X4 => VectorSize::Size32x4,
B64X2 => VectorSize::Size64x2,
F32X2 => VectorSize::Size32x2,
F32X4 => VectorSize::Size32x4,
F64X2 => VectorSize::Size64x2,
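
For reference, `from_lane_size` is symmetric in the Q (64- vs 128-bit) dimension; a sketch, assuming the enum derives `PartialEq` like its neighbours:

assert_eq!(VectorSize::from_lane_size(ScalarSize::Size32, false), VectorSize::Size32x2);
assert_eq!(VectorSize::from_lane_size(ScalarSize::Size32, true), VectorSize::Size32x4);
// (ScalarSize::Size64, false) has no half-width vector form and panics.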

View File

@@ -437,6 +437,21 @@ fn enc_stxr(ty: Type, rs: Writable<Reg>, rt: Reg, rn: Reg) -> u32 {
| machreg_to_gpr(rt)
}
fn enc_asimd_mod_imm(rd: Writable<Reg>, q_op: u32, cmode: u32, imm: u8) -> u32 {
let abc = (imm >> 5) as u32;
let defgh = (imm & 0b11111) as u32;
debug_assert_eq!(cmode & 0b1111, cmode);
debug_assert_eq!(q_op & 0b11, q_op);
0b0_0_0_0111100000_000_0000_01_00000_00000
| (q_op << 29)
| (abc << 16)
| (cmode << 12)
| (defgh << 5)
| machreg_to_vec(rd.to_reg())
}
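
As a sanity check of the field packing, here is the word for `movi v31.16b, #255` computed by hand; the expected bytes come from the emission test added below:

// imm = 0b111_11111 -> abc = 0b111, defgh = 0b11111; cmode = 0b1110,
// q_op = 0b10 (Q = 1 for the 128-bit form, op = 0), Rd = v31.
let base = 0b0_0_0_0111100000_000_0000_01_00000_00000u32; // == 0x0F00_0400
let word = base | (0b10 << 29) | (0b111 << 16) | (0b1110 << 12) | (0b11111 << 5) | 31;
assert_eq!(word, 0x4F07_E7FF); // emitted little-endian as "FFE7074F"
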
/// State carried between emissions of a sequence of instructions.
#[derive(Default, Clone, Debug)]
pub struct EmitState {
@@ -1588,19 +1603,6 @@ impl MachInstEmit for Inst {
};
sink.put4(enc_inttofpu(top16, rd, rn));
}
&Inst::LoadFpuConst32 { rd, const_data } => {
let inst = Inst::FpuLoad32 {
rd,
mem: AMode::Label(MemLabel::PCRel(8)),
srcloc: None,
};
inst.emit(sink, emit_info, state);
let inst = Inst::Jump {
dest: BranchTarget::ResolvedOffset(8),
};
inst.emit(sink, emit_info, state);
sink.put4(const_data.to_bits());
}
&Inst::LoadFpuConst64 { rd, const_data } => {
let inst = Inst::FpuLoad64 {
rd,
@@ -1612,7 +1614,7 @@ impl MachInstEmit for Inst {
dest: BranchTarget::ResolvedOffset(12),
};
inst.emit(sink, emit_info, state);
sink.put8(const_data.to_bits());
sink.put8(const_data);
}
&Inst::LoadFpuConst128 { rd, const_data } => {
let inst = Inst::FpuLoad128 {
@@ -1751,6 +1753,53 @@ impl MachInstEmit for Inst {
| machreg_to_vec(rd.to_reg()),
);
}
&Inst::VecDupImm {
rd,
imm,
invert,
size,
} => {
let (imm, shift, shift_ones) = imm.value();
let (op, cmode) = match size.lane_size() {
ScalarSize::Size8 => {
assert!(!invert);
assert_eq!(shift, 0);
(0, 0b1110)
}
ScalarSize::Size16 => {
let s = shift & 8;
assert!(!shift_ones);
assert_eq!(s, shift);
(invert as u32, 0b1000 | (s >> 2))
}
ScalarSize::Size32 => {
if shift_ones {
assert!(shift == 8 || shift == 16);
(invert as u32, 0b1100 | (shift >> 4))
} else {
let s = shift & 24;
assert_eq!(s, shift);
(invert as u32, 0b0000 | (s >> 2))
}
}
ScalarSize::Size64 => {
assert!(!invert);
assert_eq!(shift, 0);
(1, 0b1110)
}
_ => unreachable!(),
};
let q_op = op | ((size.is_128bits() as u32) << 1);
sink.put4(enc_asimd_mod_imm(rd, q_op, cmode, imm));
}
&Inst::VecExtend {
t,
rd,
@@ -1803,8 +1852,8 @@ impl MachInstEmit for Inst {
&Inst::VecMovElement {
rd,
rn,
idx1,
idx2,
dest_idx,
src_idx,
size,
} => {
let (imm5, shift) = match size.lane_size() {
@@ -1815,10 +1864,10 @@ impl MachInstEmit for Inst {
_ => unreachable!(),
};
let mask = 0b11111 >> shift;
debug_assert_eq!(idx1 & mask, idx1);
debug_assert_eq!(idx2 & mask, idx2);
let imm4 = (idx2 as u32) << (shift - 1);
let imm5 = imm5 | ((idx1 as u32) << shift);
debug_assert_eq!(dest_idx & mask, dest_idx);
debug_assert_eq!(src_idx & mask, src_idx);
let imm4 = (src_idx as u32) << (shift - 1);
let imm5 = imm5 | ((dest_idx as u32) << shift);
sink.put4(
0b011_01110000_00000_0_0000_1_00000_00000
| (imm5 << 16)
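
The `cmode` values selected in the `VecDupImm` arm above follow the Advanced SIMD modified-immediate tables of the Arm ARM. A sketch of just the 32-bit case (a hypothetical helper mirroring the match arm, not code from this PR):

fn cmode_32(shift: u32, shift_ones: bool) -> u32 {
    if shift_ones {
        // MSL ("shift ones"): shift 8 or 16 -> cmode 0b1100 or 0b1101
        debug_assert!(shift == 8 || shift == 16);
        0b1100 | (shift >> 4)
    } else {
        // LSL: shift 0/8/16/24 -> cmode 0b0000/0b0010/0b0100/0b0110
        debug_assert_eq!(shift & 24, shift);
        shift >> 2
    }
}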

View File

@@ -2034,6 +2034,26 @@ fn test_aarch64_binemit() {
"5205084E",
"dup v18.2d, v10.d[0]",
));
insns.push((
Inst::VecDupImm {
rd: writable_vreg(31),
imm: ASIMDMovModImm::maybe_from_u64(255, ScalarSize::Size8).unwrap(),
invert: false,
size: VectorSize::Size8x16,
},
"FFE7074F",
"movi v31.16b, #255",
));
insns.push((
Inst::VecDupImm {
rd: writable_vreg(0),
imm: ASIMDMovModImm::zero(),
invert: true,
size: VectorSize::Size16x4,
},
"0084002F",
"mvni v0.4h, #0",
));
insns.push((
Inst::VecExtend {
t: VecExtendOp::Sxtl8,
@@ -2099,8 +2119,8 @@ fn test_aarch64_binemit() {
Inst::VecMovElement {
rd: writable_vreg(0),
rn: vreg(31),
idx1: 7,
idx2: 7,
dest_idx: 7,
src_idx: 7,
size: VectorSize::Size16x8,
},
"E0771E6E",
@@ -2111,8 +2131,8 @@ fn test_aarch64_binemit() {
Inst::VecMovElement {
rd: writable_vreg(31),
rn: vreg(16),
idx1: 1,
idx2: 0,
dest_idx: 1,
src_idx: 0,
size: VectorSize::Size32x2,
},
"1F060C6E",
@@ -4781,19 +4801,10 @@ fn test_aarch64_binemit() {
"str q16, [x8, x9, LSL #4]",
));
insns.push((
Inst::LoadFpuConst32 {
rd: writable_vreg(16),
const_data: 1.0,
},
"5000001C020000140000803F",
"ldr s16, pc+8 ; b 8 ; data.f32 1",
));
insns.push((
Inst::LoadFpuConst64 {
rd: writable_vreg(16),
const_data: 1.0,
const_data: 1.0_f64.to_bits(),
},
"5000005C03000014000000000000F03F",
"ldr d16, pc+8 ; b 12 ; data.f64 1",

View File

@@ -4,7 +4,7 @@
#[allow(dead_code)]
use crate::ir::types::*;
use crate::ir::Type;
use crate::isa::aarch64::inst::OperandSize;
use crate::isa::aarch64::inst::{OperandSize, ScalarSize};
use regalloc::{PrettyPrint, RealRegUniverse};
@@ -667,6 +667,40 @@ impl MoveWideConst {
}
}
/// Advanced SIMD modified immediate as used by MOVI/MVNI.
#[derive(Clone, Copy, Debug)]
pub struct ASIMDMovModImm {
imm: u8,
shift: u8,
shift_ones: bool,
}
impl ASIMDMovModImm {
/// Construct a replicated-pattern immediate from a 64-bit constant, if it is
/// representable in this format.
pub fn maybe_from_u64(value: u64, size: ScalarSize) -> Option<ASIMDMovModImm> {
match size {
ScalarSize::Size8 if value < 256 => Some(ASIMDMovModImm {
imm: value as u8,
shift: 0,
shift_ones: false,
}),
_ => None,
}
}
/// Create a zero immediate of this format.
pub fn zero() -> Self {
ASIMDMovModImm {
imm: 0,
shift: 0,
shift_ones: false,
}
}
/// Return the raw encoding fields: (immediate, shift amount, shift-ones flag).
pub fn value(&self) -> (u8, u32, bool) {
(self.imm, self.shift as u32, self.shift_ones)
}
}
impl PrettyPrint for NZCV {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
let fmt = |c: char, v| if v { c.to_ascii_uppercase() } else { c };
@@ -746,6 +780,17 @@ impl PrettyPrint for MoveWideConst {
}
}
impl PrettyPrint for ASIMDMovModImm {
fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String {
if self.shift == 0 {
format!("#{}", self.imm)
} else {
let shift_type = if self.shift_ones { "MSL" } else { "LSL" };
format!("#{}, {} #{}", self.imm, shift_type, self.shift)
}
}
}
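
Putting the constructor, accessor, and printer together (a sketch; the printed form matches the `movi v31.16b, #255` emission test):

let imm = ASIMDMovModImm::maybe_from_u64(255, ScalarSize::Size8).unwrap();
assert_eq!(imm.value(), (255, 0, false));
assert_eq!(imm.show_rru(None), "#255");
// Values that do not fit the supported 8-bit form are rejected:
assert!(ASIMDMovModImm::maybe_from_u64(256, ScalarSize::Size8).is_none());
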
#[cfg(test)]
mod test {
use super::*;

View File

@@ -18,6 +18,7 @@ use regalloc::{RegUsageCollector, RegUsageMapper};
use alloc::boxed::Box;
use alloc::vec::Vec;
use core::convert::TryFrom;
use smallvec::{smallvec, SmallVec};
use std::string::{String, ToString};
@@ -826,14 +827,9 @@ pub enum Inst {
srcloc: Option<SourceLoc>,
},
LoadFpuConst32 {
rd: Writable<Reg>,
const_data: f32,
},
LoadFpuConst64 {
rd: Writable<Reg>,
const_data: f64,
const_data: u64,
},
LoadFpuConst128 {
@@ -922,6 +918,14 @@ pub enum Inst {
size: VectorSize,
},
/// Duplicate immediate to vector.
VecDupImm {
rd: Writable<Reg>,
imm: ASIMDMovModImm,
invert: bool,
size: VectorSize,
},
/// Vector extend.
VecExtend {
t: VecExtendOp,
@@ -934,8 +938,8 @@ pub enum Inst {
VecMovElement {
rd: Writable<Reg>,
rn: Reg,
idx1: u8,
idx2: u8,
dest_idx: u8,
src_idx: u8,
size: VectorSize,
},
@@ -1297,29 +1301,146 @@ impl Inst {
}
}
/// Create an instruction that loads a 32-bit floating-point constant.
pub fn load_fp_constant32(rd: Writable<Reg>, value: f32) -> Inst {
// TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits.
Inst::LoadFpuConst32 {
rd,
const_data: value,
/// Create instructions that load a 32-bit floating-point constant.
pub fn load_fp_constant32<F: FnMut(RegClass, Type) -> Writable<Reg>>(
rd: Writable<Reg>,
value: u32,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 4]> {
if value == 0 {
smallvec![Inst::VecDupImm {
rd,
imm: ASIMDMovModImm::zero(),
invert: false,
size: VectorSize::Size8x8
}]
} else {
// TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits.
let tmp = alloc_tmp(RegClass::I64, I32);
let mut insts = Inst::load_constant(tmp, value as u64);
insts.push(Inst::MovToFpu {
rd,
rn: tmp.to_reg(),
});
insts
}
}
/// Create an instruction that loads a 64-bit floating-point constant.
pub fn load_fp_constant64(rd: Writable<Reg>, value: f64) -> Inst {
// TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits.
Inst::LoadFpuConst64 {
rd,
const_data: value,
/// Create instructions that load a 64-bit floating-point constant.
pub fn load_fp_constant64<F: FnMut(RegClass, Type) -> Writable<Reg>>(
rd: Writable<Reg>,
const_data: u64,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 4]> {
if let Ok(const_data) = u32::try_from(const_data) {
Inst::load_fp_constant32(rd, const_data, alloc_tmp)
// TODO: use FMOV immediate form when `const_data` has sufficiently few mantissa/exponent bits.
// Also, treat it as half of a 128-bit vector and consider replicated patterns. Scalar MOVI
// might also be an option.
} else if const_data & (u32::MAX as u64) == 0 {
let tmp = alloc_tmp(RegClass::I64, I64);
let mut insts = Inst::load_constant(tmp, const_data);
insts.push(Inst::MovToFpu {
rd,
rn: tmp.to_reg(),
});
insts
} else {
smallvec![Inst::LoadFpuConst64 { rd, const_data }]
}
}
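
For example, `1.0_f64` has bit pattern `0x3FF0_0000_0000_0000`: it does not fit in a `u32`, but its low 32 bits are zero, so the middle branch above applies (a sketch of the expected sequence, not a quoted test):

assert_eq!(1.0_f64.to_bits(), 0x3FF0_0000_0000_0000);
// load_constant needs a single `movz x0, #16368, LSL #48` (16368 == 0x3FF0),
// followed by `fmov d0, x0` -- no literal pool and no branch over it.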
/// Create an instruction that loads a 128-bit vector constant.
pub fn load_fp_constant128(rd: Writable<Reg>, value: u128) -> Inst {
Inst::LoadFpuConst128 {
rd,
const_data: value,
/// Create instructions that load a 128-bit vector constant.
pub fn load_fp_constant128<F: FnMut(RegClass, Type) -> Writable<Reg>>(
rd: Writable<Reg>,
const_data: u128,
alloc_tmp: F,
) -> SmallVec<[Inst; 5]> {
if let Ok(const_data) = u64::try_from(const_data) {
SmallVec::from(&Inst::load_fp_constant64(rd, const_data, alloc_tmp)[..])
} else if let Some((pattern, size)) =
Inst::get_replicated_vector_pattern(const_data, ScalarSize::Size64)
{
Inst::load_replicated_vector_pattern(
rd,
pattern,
VectorSize::from_lane_size(size, true),
alloc_tmp,
)
} else {
smallvec![Inst::LoadFpuConst128 { rd, const_data }]
}
}
/// Determine whether a 128-bit constant represents a vector consisting of elements with
/// the same value.
pub fn get_replicated_vector_pattern(
value: u128,
size: ScalarSize,
) -> Option<(u64, ScalarSize)> {
let (mask, shift, next_size) = match size {
ScalarSize::Size8 => (u8::MAX as u128, 8, ScalarSize::Size128),
ScalarSize::Size16 => (u16::MAX as u128, 16, ScalarSize::Size8),
ScalarSize::Size32 => (u32::MAX as u128, 32, ScalarSize::Size16),
ScalarSize::Size64 => (u64::MAX as u128, 64, ScalarSize::Size32),
_ => return None,
};
let mut r = None;
let v = value & mask;
if (value >> shift) & mask == v {
r = Inst::get_replicated_vector_pattern(v, next_size);
if r.is_none() {
r = Some((v as u64, size));
}
}
r
}
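
A worked example of the recursion: it keeps halving while both halves agree, so the narrowest replicated element wins (assuming `ScalarSize` derives `PartialEq`/`Debug`):

let v = u128::from_le_bytes([0x42; 16]); // sixteen copies of 0x42
assert_eq!(
    Inst::get_replicated_vector_pattern(v, ScalarSize::Size64),
    Some((0x42, ScalarSize::Size8)),
); // Size64 halves match, then Size32, Size16, Size8; Size128 stops the recursion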
/// Create instructions that load a 128-bit vector constant consisting of elements with
/// the same value.
pub fn load_replicated_vector_pattern<F: FnMut(RegClass, Type) -> Writable<Reg>>(
rd: Writable<Reg>,
pattern: u64,
size: VectorSize,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 5]> {
let lane_size = size.lane_size();
if let Some(imm) = ASIMDMovModImm::maybe_from_u64(pattern, lane_size) {
smallvec![Inst::VecDupImm {
rd,
imm,
invert: false,
size
}]
} else if let Some(imm) = ASIMDMovModImm::maybe_from_u64(!pattern, lane_size) {
debug_assert_ne!(lane_size, ScalarSize::Size8);
debug_assert_ne!(lane_size, ScalarSize::Size64);
smallvec![Inst::VecDupImm {
rd,
imm,
invert: true,
size
}]
} else {
let tmp = alloc_tmp(RegClass::I64, I64);
let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern)[..]);
insts.push(Inst::VecDup {
rd,
rn: tmp.to_reg(),
size,
});
insts
}
}
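
When neither the pattern nor its complement fits the modified-immediate form, the GPR-plus-DUP fallback fires. The 16-bit pattern 42679 (0xA6B7) from the new filetest at the end of this PR is such a case; a sketch:

let pattern: u64 = 42679; // 0xA6B7
assert!(ASIMDMovModImm::maybe_from_u64(pattern, ScalarSize::Size16).is_none());
assert!(ASIMDMovModImm::maybe_from_u64(!pattern, ScalarSize::Size16).is_none());
// hence the fallback: movz x0, #42679 ; dup v0.8h, w0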
@@ -1704,9 +1825,7 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_use(rd);
memarg_regs(mem, collector);
}
&Inst::LoadFpuConst32 { rd, .. }
| &Inst::LoadFpuConst64 { rd, .. }
| &Inst::LoadFpuConst128 { rd, .. } => {
&Inst::LoadFpuConst64 { rd, .. } | &Inst::LoadFpuConst128 { rd, .. } => {
collector.add_def(rd);
}
&Inst::FpuToInt { rd, rn, .. } => {
@@ -1746,6 +1865,9 @@ fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) {
collector.add_def(rd);
collector.add_use(rn);
}
&Inst::VecDupImm { rd, .. } => {
collector.add_def(rd);
}
&Inst::VecExtend { rd, rn, .. } => {
collector.add_def(rd);
collector.add_use(rn);
@@ -2344,9 +2466,6 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
map_use(mapper, rd);
map_mem(mapper, mem);
}
&mut Inst::LoadFpuConst32 { ref mut rd, .. } => {
map_def(mapper, rd);
}
&mut Inst::LoadFpuConst64 { ref mut rd, .. } => {
map_def(mapper, rd);
}
@@ -2441,6 +2560,9 @@ fn aarch64_map_regs<RUM: RegUsageMapper>(inst: &mut Inst, mapper: &RUM) {
map_def(mapper, rd);
map_use(mapper, rn);
}
&mut Inst::VecDupImm { ref mut rd, .. } => {
map_def(mapper, rd);
}
&mut Inst::VecExtend {
ref mut rd,
ref mut rn,
@@ -2631,19 +2753,12 @@ impl MachInst for Inst {
to_reg: Writable<Reg>,
value: u64,
ty: Type,
_alloc_tmp: F,
alloc_tmp: F,
) -> SmallVec<[Inst; 4]> {
if ty == F64 {
let mut ret = SmallVec::new();
ret.push(Inst::load_fp_constant64(to_reg, f64::from_bits(value)));
ret
Inst::load_fp_constant64(to_reg, value, alloc_tmp)
} else if ty == F32 {
let mut ret = SmallVec::new();
ret.push(Inst::load_fp_constant32(
to_reg,
f32::from_bits(value as u32),
));
ret
Inst::load_fp_constant32(to_reg, value as u32, alloc_tmp)
} else {
// Must be an integer type.
debug_assert!(
@@ -3216,13 +3331,9 @@ impl Inst {
let mem = mem.show_rru(mb_rru);
format!("{}str {}, {}", mem_str, rd, mem)
}
&Inst::LoadFpuConst32 { rd, const_data } => {
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size32);
format!("ldr {}, pc+8 ; b 8 ; data.f32 {}", rd, const_data)
}
&Inst::LoadFpuConst64 { rd, const_data } => {
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size64);
format!("ldr {}, pc+8 ; b 12 ; data.f64 {}", rd, const_data)
format!("ldr {}, pc+8 ; b 12 ; data.f64 {}", rd, f64::from_bits(const_data))
}
&Inst::LoadFpuConst128 { rd, const_data } => {
let rd = show_vreg_scalar(rd.to_reg(), mb_rru, ScalarSize::Size128);
@@ -3330,6 +3441,17 @@ impl Inst {
let rn = show_vreg_element(rn, mb_rru, 0, size);
format!("dup {}, {}", rd, rn)
}
&Inst::VecDupImm { rd, imm, invert, size } => {
let imm = imm.show_rru(mb_rru);
let op = if invert {
"mvni"
} else {
"movi"
};
let rd = show_vreg_vector(rd.to_reg(), mb_rru, size);
format!("{} {}, {}", op, rd, imm)
}
&Inst::VecExtend { t, rd, rn, high_half } => {
let (op, dest, src) = match (t, high_half) {
(VecExtendOp::Sxtl8, false) => ("sxtl", VectorSize::Size16x8, VectorSize::Size8x8),
@@ -3352,12 +3474,12 @@ impl Inst {
&Inst::VecMovElement {
rd,
rn,
idx1,
idx2,
dest_idx,
src_idx,
size,
} => {
let rd = show_vreg_element(rd.to_reg(), mb_rru, idx1, size);
let rn = show_vreg_element(rn, mb_rru, idx2, size);
let rd = show_vreg_element(rd.to_reg(), mb_rru, dest_idx, size);
let rn = show_vreg_element(rn, mb_rru, src_idx, size);
format!("mov {}, {}", rd, rn)
}
&Inst::VecMiscNarrow { op, rd, rn, size, high_half } => {

View File

@@ -813,7 +813,11 @@ pub(crate) fn lower_constant_f32<C: LowerCtx<I = Inst>>(
rd: Writable<Reg>,
value: f32,
) {
ctx.emit(Inst::load_fp_constant32(rd, value));
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
for inst in Inst::load_fp_constant32(rd, value.to_bits(), alloc_tmp) {
ctx.emit(inst);
}
}
pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
@@ -821,7 +825,11 @@ pub(crate) fn lower_constant_f64<C: LowerCtx<I = Inst>>(
rd: Writable<Reg>,
value: f64,
) {
ctx.emit(Inst::load_fp_constant64(rd, value));
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
for inst in Inst::load_fp_constant64(rd, value.to_bits(), alloc_tmp) {
ctx.emit(inst);
}
}
pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
@@ -829,7 +837,38 @@ pub(crate) fn lower_constant_f128<C: LowerCtx<I = Inst>>(
rd: Writable<Reg>,
value: u128,
) {
ctx.emit(Inst::load_fp_constant128(rd, value));
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
for inst in Inst::load_fp_constant128(rd, value, alloc_tmp) {
ctx.emit(inst);
}
}
pub(crate) fn lower_splat_const<C: LowerCtx<I = Inst>>(
ctx: &mut C,
rd: Writable<Reg>,
value: u64,
size: VectorSize,
) {
let (value, narrow_size) = match size.lane_size() {
ScalarSize::Size8 => (value as u8 as u64, ScalarSize::Size128),
ScalarSize::Size16 => (value as u16 as u64, ScalarSize::Size8),
ScalarSize::Size32 => (value as u32 as u64, ScalarSize::Size16),
ScalarSize::Size64 => (value, ScalarSize::Size32),
_ => unreachable!(),
};
let (value, size) = match Inst::get_replicated_vector_pattern(value as u128, narrow_size) {
Some((value, lane_size)) => (
value,
VectorSize::from_lane_size(lane_size, size.is_128bits()),
),
None => (value, size),
};
let alloc_tmp = |class, ty| ctx.alloc_tmp(class, ty);
for inst in Inst::load_replicated_vector_pattern(rd, value, size, alloc_tmp) {
ctx.emit(inst);
}
}
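
The narrowing step starts the replication check one lane size below the splat's own lane size, so a wide constant that is itself a repeated pattern degrades to a cheaper, narrower MOVI. A hypothetical case (0x4242 is an illustrative value, not from the filetests):

// splat.i16x8 of 0x4242: each 16-bit lane is two equal bytes, so the check
// (entered at ScalarSize::Size8, the narrow size for 16-bit lanes) succeeds:
assert_eq!(
    Inst::get_replicated_vector_pattern(0x4242, ScalarSize::Size8),
    Some((0x42, ScalarSize::Size8)),
); // lowered as a single: movi v0.16b, #66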
pub(crate) fn lower_condcode(cc: IntCC) -> Cond {

View File

@@ -2013,24 +2013,47 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
ctx.emit(Inst::VecMovElement {
rd,
rn,
idx1: idx,
idx2: 0,
dest_idx: idx,
src_idx: 0,
size,
});
}
}
Opcode::Splat => {
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let rd = get_output_reg(ctx, outputs[0]);
let input_ty = ctx.input_ty(insn, 0);
let size = VectorSize::from_ty(ty.unwrap());
let inst = if ty_has_int_representation(input_ty) {
Inst::VecDup { rd, rn, size }
if let Some((_, insn)) = maybe_input_insn_multi(
ctx,
inputs[0],
&[
Opcode::Bconst,
Opcode::F32const,
Opcode::F64const,
Opcode::Iconst,
],
) {
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
} else if let Some(insn) =
maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Iconst, Opcode::Ireduce)
{
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
} else if let Some(insn) =
maybe_input_insn_via_conv(ctx, inputs[0], Opcode::Bconst, Opcode::Breduce)
{
lower_splat_const(ctx, rd, ctx.get_constant(insn).unwrap(), size);
} else {
Inst::VecDupFromFpu { rd, rn, size }
};
ctx.emit(inst);
let input_ty = ctx.input_ty(insn, 0);
let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
let inst = if ty_has_int_representation(input_ty) {
Inst::VecDup { rd, rn, size }
} else {
Inst::VecDupFromFpu { rd, rn, size }
};
ctx.emit(inst);
}
}
Opcode::VanyTrue | Opcode::VallTrue => {
@@ -2820,15 +2843,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
let rtmp2 = ctx.alloc_tmp(RegClass::V128, in_ty);
if in_bits == 32 {
ctx.emit(Inst::LoadFpuConst32 {
rd: rtmp1,
const_data: max as f32,
});
lower_constant_f32(ctx, rtmp1, max as f32);
} else {
ctx.emit(Inst::LoadFpuConst64 {
rd: rtmp1,
const_data: max,
});
lower_constant_f64(ctx, rtmp1, max);
}
ctx.emit(Inst::FpuRRR {
fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64),
@@ -2837,15 +2854,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
rm: rtmp1.to_reg(),
});
if in_bits == 32 {
ctx.emit(Inst::LoadFpuConst32 {
rd: rtmp1,
const_data: min as f32,
});
lower_constant_f32(ctx, rtmp1, min as f32);
} else {
ctx.emit(Inst::LoadFpuConst64 {
rd: rtmp1,
const_data: min,
});
lower_constant_f64(ctx, rtmp1, min);
}
ctx.emit(Inst::FpuRRR {
fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64),
@@ -2855,15 +2866,9 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
});
if out_signed {
if in_bits == 32 {
ctx.emit(Inst::LoadFpuConst32 {
rd: rtmp1,
const_data: 0.0,
});
lower_constant_f32(ctx, rtmp1, 0.0);
} else {
ctx.emit(Inst::LoadFpuConst64 {
rd: rtmp1,
const_data: 0.0,
});
lower_constant_f64(ctx, rtmp1, 0.0);
}
}
if in_bits == 32 {

View File

@@ -9,7 +9,7 @@ block0:
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #1
; nextln: movz x0, #255
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret

View File

@@ -60,10 +60,12 @@ block0(v0: f32):
v1 = fcvt_to_uint.i8 v0
; check: fcmp s0, s0
; check: b.vc 8 ; udf
; check: ldr s1, pc+8 ; b 8 ; data.f32 -1
; check: movz x0, #49024, LSL #16
; check: fmov d1, x0
; check: fcmp s0, s1
; check: b.gt 8 ; udf
; check: ldr s1, pc+8 ; b 8 ; data.f32 256
; check: movz x0, #17280, LSL #16
; check: fmov d1, x0
; check: fcmp s0, s1
; check: b.mi 8 ; udf
; check: fcvtzu w0, s0
@@ -80,10 +82,12 @@ block0(v0: f64):
v1 = fcvt_to_uint.i8 v0
; check: fcmp d0, d0
; check: b.vc 8 ; udf
; check: ldr d1, pc+8 ; b 12 ; data.f64 -1
; check: movz x0, #49136, LSL #48
; check: fmov d1, x0
; check: fcmp d0, d1
; check: b.gt 8 ; udf
; check: ldr d1, pc+8 ; b 12 ; data.f64 256
; check: movz x0, #16496, LSL #48
; check: fmov d1, x0
; check: fcmp d0, d1
; check: b.mi 8 ; udf
; check: fcvtzu w0, d0
@@ -100,10 +104,12 @@ block0(v0: f32):
v1 = fcvt_to_uint.i16 v0
; check: fcmp s0, s0
; check: b.vc 8 ; udf
; check: ldr s1, pc+8 ; b 8 ; data.f32 -1
; check: movz x0, #49024, LSL #16
; check: fmov d1, x0
; check: fcmp s0, s1
; check: b.gt 8 ; udf
; check: ldr s1, pc+8 ; b 8 ; data.f32 65536
; check: movz x0, #18304, LSL #16
; check: fmov d1, x0
; check: fcmp s0, s1
; check: b.mi 8 ; udf
; check: fcvtzu w0, s0
@@ -120,10 +126,12 @@ block0(v0: f64):
v1 = fcvt_to_uint.i16 v0
; check: fcmp d0, d0
; check: b.vc 8 ; udf
; check: ldr d1, pc+8 ; b 12 ; data.f64 -1
; check: movz x0, #49136, LSL #48
; check: fmov d1, x0
; check: fcmp d0, d1
; check: b.gt 8 ; udf
; check: ldr d1, pc+8 ; b 12 ; data.f64 65536
; check: movz x0, #16624, LSL #48
; check: fmov d1, x0
; check: fcmp d0, d1
; check: b.mi 8 ; udf
; check: fcvtzu w0, d0
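
The movz immediates in these checks are just the top 16 bits of each bound's IEEE bit pattern, with the remaining bits zero; for instance:

assert_eq!((-1.0_f32).to_bits(), 0xBF80_0000); // 0xBF80 == 49024
assert_eq!(256.0_f32.to_bits(), 0x4380_0000);  // 0x4380 == 17280
assert_eq!((-1.0_f64).to_bits(), 0xBFF0_0000_0000_0000); // 0xBFF0 == 49136
// hence `movz x0, #49024, LSL #16`, `movz x0, #17280, LSL #16`, and
// `movz x0, #49136, LSL #48` above, each followed by a single fmov.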

View File

@@ -427,10 +427,12 @@ block0(v0: f32):
; nextln: mov fp, sp
; nextln: fcmp s0, s0
; nextln: b.vc 8 ; udf
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 -1
; nextln: movz x0, #49024, LSL #16
; nextln: fmov d1, x0
; nextln: fcmp s0, s1
; nextln: b.gt 8 ; udf
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 4294967300
; nextln: movz x0, #20352, LSL #16
; nextln: fmov d1, x0
; nextln: fcmp s0, s1
; nextln: b.mi 8 ; udf
; nextln: fcvtzu w0, s0
@@ -448,10 +450,12 @@ block0(v0: f32):
; nextln: mov fp, sp
; nextln: fcmp s0, s0
; nextln: b.vc 8 ; udf
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 -2147483600
; nextln: movz x0, #52992, LSL #16
; nextln: fmov d1, x0
; nextln: fcmp s0, s1
; nextln: b.ge 8 ; udf
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 2147483600
; nextln: movz x0, #20224, LSL #16
; nextln: fmov d1, x0
; nextln: fcmp s0, s1
; nextln: b.mi 8 ; udf
; nextln: fcvtzs w0, s0
@@ -469,10 +473,12 @@ block0(v0: f32):
; nextln: mov fp, sp
; nextln: fcmp s0, s0
; nextln: b.vc 8 ; udf
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 -1
; nextln: movz x0, #49024, LSL #16
; nextln: fmov d1, x0
; nextln: fcmp s0, s1
; nextln: b.gt 8 ; udf
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 18446744000000000000
; nextln: movz x0, #24448, LSL #16
; nextln: fmov d1, x0
; nextln: fcmp s0, s1
; nextln: b.mi 8 ; udf
; nextln: fcvtzu x0, s0
@@ -490,10 +496,12 @@ block0(v0: f32):
; nextln: mov fp, sp
; nextln: fcmp s0, s0
; nextln: b.vc 8 ; udf
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 -9223372000000000000
; nextln: movz x0, #57088, LSL #16
; nextln: fmov d1, x0
; nextln: fcmp s0, s1
; nextln: b.ge 8 ; udf
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 9223372000000000000
; nextln: movz x0, #24320, LSL #16
; nextln: fmov d1, x0
; nextln: fcmp s0, s1
; nextln: b.mi 8 ; udf
; nextln: fcvtzs x0, s0
@@ -511,10 +519,12 @@ block0(v0: f64):
; nextln: mov fp, sp
; nextln: fcmp d0, d0
; nextln: b.vc 8 ; udf
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 -1
; nextln: movz x0, #49136, LSL #48
; nextln: fmov d1, x0
; nextln: fcmp d0, d1
; nextln: b.gt 8 ; udf
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 4294967296
; nextln: movz x0, #16880, LSL #48
; nextln: fmov d1, x0
; nextln: fcmp d0, d1
; nextln: b.mi 8 ; udf
; nextln: fcvtzu w0, d0
@@ -535,7 +545,8 @@ block0(v0: f64):
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 -2147483649
; nextln: fcmp d0, d1
; nextln: b.gt 8 ; udf
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 2147483648
; nextln: movz x0, #16864, LSL #48
; nextln: fmov d1, x0
; nextln: fcmp d0, d1
; nextln: b.mi 8 ; udf
; nextln: fcvtzs w0, d0
@@ -553,10 +564,12 @@ block0(v0: f64):
; nextln: mov fp, sp
; nextln: fcmp d0, d0
; nextln: b.vc 8 ; udf
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 -1
; nextln: movz x0, #49136, LSL #48
; nextln: fmov d1, x0
; nextln: fcmp d0, d1
; nextln: b.gt 8 ; udf
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 18446744073709552000
; nextln: movz x0, #17392, LSL #48
; nextln: fmov d1, x0
; nextln: fcmp d0, d1
; nextln: b.mi 8 ; udf
; nextln: fcvtzu x0, d0
@@ -574,10 +587,12 @@ block0(v0: f64):
; nextln: mov fp, sp
; nextln: fcmp d0, d0
; nextln: b.vc 8 ; udf
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 -9223372036854776000
; nextln: movz x0, #50144, LSL #48
; nextln: fmov d1, x0
; nextln: fcmp d0, d1
; nextln: b.ge 8 ; udf
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 9223372036854776000
; nextln: movz x0, #17376, LSL #48
; nextln: fmov d1, x0
; nextln: fcmp d0, d1
; nextln: b.mi 8 ; udf
; nextln: fcvtzs x0, d0
@@ -697,9 +712,10 @@ block0(v0: f32):
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 4294967300
; nextln: movz x0, #20352, LSL #16
; nextln: fmov d1, x0
; nextln: fmin s2, s0, s1
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 0
; nextln: movi v1.8b, #0
; nextln: fmax s2, s2, s1
; nextln: fcmp s0, s0
; nextln: fcsel s0, s1, s2, ne
@@ -716,11 +732,13 @@ block0(v0: f32):
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 2147483600
; nextln: movz x0, #20224, LSL #16
; nextln: fmov d1, x0
; nextln: fmin s1, s0, s1
; nextln: ldr s2, pc+8 ; b 8 ; data.f32 -2147483600
; nextln: movz x0, #52992, LSL #16
; nextln: fmov d2, x0
; nextln: fmax s1, s1, s2
; nextln: ldr s2, pc+8 ; b 8 ; data.f32 0
; nextln: movi v2.8b, #0
; nextln: fcmp s0, s0
; nextln: fcsel s0, s2, s1, ne
; nextln: fcvtzs w0, s0
@@ -736,9 +754,10 @@ block0(v0: f32):
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 18446744000000000000
; nextln: movz x0, #24448, LSL #16
; nextln: fmov d1, x0
; nextln: fmin s2, s0, s1
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 0
; nextln: movi v1.8b, #0
; nextln: fmax s2, s2, s1
; nextln: fcmp s0, s0
; nextln: fcsel s0, s1, s2, ne
@@ -755,11 +774,13 @@ block0(v0: f32):
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldr s1, pc+8 ; b 8 ; data.f32 9223372000000000000
; nextln: movz x0, #24320, LSL #16
; nextln: fmov d1, x0
; nextln: fmin s1, s0, s1
; nextln: ldr s2, pc+8 ; b 8 ; data.f32 -9223372000000000000
; nextln: movz x0, #57088, LSL #16
; nextln: fmov d2, x0
; nextln: fmax s1, s1, s2
; nextln: ldr s2, pc+8 ; b 8 ; data.f32 0
; nextln: movi v2.8b, #0
; nextln: fcmp s0, s0
; nextln: fcsel s0, s2, s1, ne
; nextln: fcvtzs x0, s0
@@ -777,7 +798,7 @@ block0(v0: f64):
; nextln: mov fp, sp
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 4294967295
; nextln: fmin d2, d0, d1
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 0
; nextln: movi v1.8b, #0
; nextln: fmax d2, d2, d1
; nextln: fcmp d0, d0
; nextln: fcsel d0, d1, d2, ne
@@ -796,9 +817,10 @@ block0(v0: f64):
; nextln: mov fp, sp
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 2147483647
; nextln: fmin d1, d0, d1
; nextln: ldr d2, pc+8 ; b 12 ; data.f64 -2147483648
; nextln: movz x0, #49632, LSL #48
; nextln: fmov d2, x0
; nextln: fmax d1, d1, d2
; nextln: ldr d2, pc+8 ; b 12 ; data.f64 0
; nextln: movi v2.8b, #0
; nextln: fcmp d0, d0
; nextln: fcsel d0, d2, d1, ne
; nextln: fcvtzs w0, d0
@@ -814,9 +836,10 @@ block0(v0: f64):
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 18446744073709552000
; nextln: movz x0, #17392, LSL #48
; nextln: fmov d1, x0
; nextln: fmin d2, d0, d1
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 0
; nextln: movi v1.8b, #0
; nextln: fmax d2, d2, d1
; nextln: fcmp d0, d0
; nextln: fcsel d0, d1, d2, ne
@@ -833,11 +856,13 @@ block0(v0: f64):
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: ldr d1, pc+8 ; b 12 ; data.f64 9223372036854776000
; nextln: movz x0, #17376, LSL #48
; nextln: fmov d1, x0
; nextln: fmin d1, d0, d1
; nextln: ldr d2, pc+8 ; b 12 ; data.f64 -9223372036854776000
; nextln: movz x0, #50144, LSL #48
; nextln: fmov d2, x0
; nextln: fmax d1, d1, d2
; nextln: ldr d2, pc+8 ; b 12 ; data.f64 0
; nextln: movi v2.8b, #0
; nextln: fcmp d0, d0
; nextln: fcsel d0, d2, d1, ne
; nextln: fcvtzs x0, d0

View File

@@ -0,0 +1,49 @@
test compile
target aarch64
function %f1() -> i64x2 {
block0:
v0 = iconst.i64 281474976710657
v1 = splat.i64x2 v0
return v1
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #1
; nextln: movk x0, #1, LSL #48
; nextln: dup v0.2d, x0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f2() -> i16x8 {
block0:
v0 = iconst.i32 42679
v1 = ireduce.i16 v0
v2 = splat.i16x8 v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movz x0, #42679
; nextln: dup v0.8h, w0
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
function %f3() -> b8x16 {
block0:
v0 = bconst.b32 true
v1 = breduce.b8 v0
v2 = splat.b8x16 v1
return v2
}
; check: stp fp, lr, [sp, #-16]!
; nextln: mov fp, sp
; nextln: movi v0.16b, #255
; nextln: mov sp, fp
; nextln: ldp fp, lr, [sp], #16
; nextln: ret
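
For %f3, the pieces added across this PR compose end to end (a sketch of the constant flow, not literal compiler code):

// 1. is_constant_64bit widens the b8 `true` to all-ones:
assert_eq!((1u64 << 8) - 1, 0xFF);
// 2. lower_splat_const masks it to the 8-bit lane, and
// 3. the modified-immediate form accepts it:
assert!(ASIMDMovModImm::maybe_from_u64(0xFF, ScalarSize::Size8).is_some());
// so the whole splat collapses to the single `movi v0.16b, #255` checked above.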