Allocate temporary intermediates when loading constants on aarch64 (#5366)

As loading constants on aarch64 can take up to 4 instructions, we need to plumb through some additional registers. Rather than pass a fixed list of registers in, pass an allocation function.
Trevor Elliott
2022-12-01 14:29:36 -08:00
committed by GitHub
parent 03715dda9d
commit d54a27d0ea
11 changed files with 158 additions and 126 deletions
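
Before the diff, a self-contained sketch of the shape of the change may help. Everything here is an illustrative stand-in, not Cranelift's real types or implementation: `Reg` and `Type` are toy versions, and the body models only the MOVZ/MOVK path (the real code can also pick MOVN-based sequences, logical immediates, or a constant pool). The point is the third parameter: the loader asks a caller-supplied callback for each intermediate register.

    #[derive(Clone, Copy, Debug)]
    struct Reg(u32);

    #[derive(Clone, Copy, Debug)]
    enum Type {
        I16,
    }

    fn load_constant(rd: Reg, value: u64, alloc_tmp: &mut dyn FnMut(Type) -> Reg) -> Vec<String> {
        // Collect the non-zero 16-bit halfwords; each one costs an instruction.
        let halfwords: Vec<(u32, u64)> = (0..4u32)
            .map(|i| (i, (value >> (16 * i)) & 0xffff))
            .filter(|&(_, hw)| hw != 0)
            .collect();
        let last = halfwords.last().map(|&(i, _)| i);
        let mut insts = Vec::new();
        let mut prev: Option<Reg> = None;
        for (i, hw) in halfwords {
            // Only the final instruction writes the requested destination; every
            // intermediate result goes to a register the callback hands out.
            let dst = if Some(i) == last { rd } else { alloc_tmp(Type::I16) };
            insts.push(match prev {
                None => format!("movz {dst:?}, #{hw:#x}, LSL #{}", 16 * i),
                Some(rn) => format!("movk {dst:?}, {rn:?}, #{hw:#x}, LSL #{}", 16 * i),
            });
            prev = Some(dst);
        }
        insts
    }

    fn main() {
        // Lowering-time caller: mint a fresh virtual register per intermediate.
        let mut next = 0;
        let mut fresh = |_: Type| {
            next += 1;
            Reg(next)
        };
        for inst in load_constant(Reg(0), 0x1234_5678_9abc_def0, &mut fresh) {
            println!("{inst}");
        }

        // Post-regalloc caller: no fresh virtual registers can be created any
        // more, so reuse one fixed scratch register for every intermediate.
        let scratch = Reg(27);
        let _ = load_constant(scratch, 0xdead_beef, &mut |_| scratch);
    }

The diff below uses exactly these two styles: lowering code passes `|ty| ctx.alloc_tmp(ty).only_reg().unwrap()`, while post-regalloc helpers such as `gen_add_imm` pass `&mut |_| scratch`.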


@@ -430,7 +430,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
} else {
let scratch2 = writable_tmp2_reg();
assert_ne!(scratch2.to_reg(), from_reg);
insts.extend(Inst::load_constant(scratch2, imm.into()));
// `gen_add_imm` is only ever called after register allocation has taken place, and as a
// result it's ok to reuse the scratch2 register here. If that changes, we'll need to
// plumb through a way to allocate temporary virtual registers.
insts.extend(Inst::load_constant(scratch2, imm.into(), &mut |_| scratch2));
insts.push(Inst::AluRRRExtend {
alu_op: ALUOp::Add,
size: OperandSize::Size64,
@@ -515,7 +518,9 @@ impl ABIMachineSpec for AArch64MachineDeps {
ret.push(adj_inst);
} else {
let tmp = writable_spilltmp_reg();
let const_inst = Inst::load_constant(tmp, amount);
// `gen_sp_reg_adjust` is called after regalloc2, so it's acceptable to reuse `tmp` for
// intermediates in `load_constant`.
let const_inst = Inst::load_constant(tmp, amount, &mut |_| tmp);
let adj_inst = Inst::AluRRRExtend {
alu_op,
size: OperandSize::Size64,
@@ -673,8 +678,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
// itself is not allowed to use the registers.
let start = writable_spilltmp_reg();
let end = writable_tmp2_reg();
insts.extend(Inst::load_constant(start, 0));
insts.extend(Inst::load_constant(end, frame_size.into()));
// `gen_inline_probestack` is called after regalloc2, so it's acceptable to reuse
// `start` and `end` as temporaries in `load_constant`.
insts.extend(Inst::load_constant(start, 0, &mut |_| start));
insts.extend(Inst::load_constant(end, frame_size.into(), &mut |_| end));
insts.push(Inst::StackProbeLoop {
start,
end: end.to_reg(),
@@ -1019,19 +1026,19 @@ impl ABIMachineSpec for AArch64MachineDeps {
insts
}
fn gen_memcpy(
fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
call_conv: isa::CallConv,
dst: Reg,
src: Reg,
tmp: Writable<Reg>,
_tmp2: Writable<Reg>,
size: usize,
mut alloc_tmp: F,
) -> SmallVec<[Self::I; 8]> {
let mut insts = SmallVec::new();
let arg0 = writable_xreg(0);
let arg1 = writable_xreg(1);
let arg2 = writable_xreg(2);
insts.extend(Inst::load_constant(tmp, size as u64).into_iter());
let tmp = alloc_tmp(Self::word_type());
insts.extend(Inst::load_constant(tmp, size as u64, &mut alloc_tmp));
insts.push(Inst::Call {
info: Box::new(CallInfo {
dest: ExternalName::LibCall(LibCall::Memcpy),


@@ -64,7 +64,7 @@ pub fn mem_finalize(
} else {
let tmp = writable_spilltmp_reg();
(
Inst::load_constant(tmp, off as u64),
Inst::load_constant(tmp, off as u64, &mut |_| tmp),
AMode::RegExtended {
rn: basereg,
rm: tmp.to_reg(),
@@ -3333,7 +3333,7 @@ impl MachInstEmit for Inst {
debug_assert!(rd.to_reg() != tmp2_reg());
debug_assert!(reg != tmp2_reg());
let tmp = writable_tmp2_reg();
for insn in Inst::load_constant(tmp, abs_offset).into_iter() {
for insn in Inst::load_constant(tmp, abs_offset, &mut |_| tmp).into_iter() {
insn.emit(&[], sink, emit_info, state);
}
let add = Inst::AluRRR {


@@ -130,7 +130,11 @@ fn inst_size_test() {
impl Inst {
/// Create instructions that load a constant, using one of several options (MOVZ, MOVN,
/// logical immediate, or constant pool).
pub fn load_constant(rd: Writable<Reg>, value: u64) -> SmallVec<[Inst; 4]> {
pub fn load_constant<F: FnMut(Type) -> Writable<Reg>>(
rd: Writable<Reg>,
value: u64,
alloc_tmp: &mut F,
) -> SmallVec<[Inst; 4]> {
// NB: this is duplicated in `lower/isle.rs` and `inst.isle` right now; if
// modifications are made here before this code is deleted in the move to
// ISLE, those locations should be updated as well.
@@ -169,23 +173,40 @@ impl Inst {
} else {
(4, OperandSize::Size64, !value)
};
// If the number of 0xffff half words is greater than the number of 0x0000 half words,
// it is more efficient to use `movn` for the first instruction.
let first_is_inverted = count_zero_half_words(negated, num_half_words)
> count_zero_half_words(value, num_half_words);
// Either 0xffff or 0x0000 half words can be skipped, depending on the first
// instruction used.
let ignored_halfword = if first_is_inverted { 0xffff } else { 0 };
let mut first_mov_emitted = false;
for i in 0..num_half_words {
let halfwords: SmallVec<[_; 4]> = (0..num_half_words)
.filter_map(|i| {
let imm16 = (value >> (16 * i)) & 0xffff;
if imm16 != ignored_halfword {
if !first_mov_emitted {
first_mov_emitted = true;
if imm16 == ignored_halfword {
None
} else {
Some((i, imm16))
}
})
.collect();
let mut prev_result = None;
let last_index = halfwords.last().unwrap().0;
for (i, imm16) in halfwords {
let shift = i * 16;
let rd = if i == last_index { rd } else { alloc_tmp(I16) };
if let Some(rn) = prev_result {
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
insts.push(Inst::MovK { rd, rn, imm, size });
} else {
if first_is_inverted {
let imm =
MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16)
MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, shift)
.unwrap();
insts.push(Inst::MovWide {
op: MoveWideOp::MovN,
@@ -194,8 +215,7 @@ impl Inst {
size,
});
} else {
let imm =
MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
insts.push(Inst::MovWide {
op: MoveWideOp::MovZ,
rd,
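As a worked example of the `first_is_inverted` heuristic above, here is a small runnable sketch with `count_zero_half_words` re-implemented for illustration (the real helper lives in this file; the signature here is an approximation):

    fn count_zero_half_words(mut value: u64, num_half_words: u32) -> usize {
        let mut count = 0;
        for _ in 0..num_half_words {
            if value & 0xffff == 0 {
                count += 1;
            }
            value >>= 16;
        }
        count
    }

    fn main() {
        // Halfwords of 0xffff_ffff_0000_1234 are [0x1234, 0x0000, 0xffff, 0xffff]:
        // two are 0xffff and only one is 0x0000, so a MOVN-based sequence wins.
        let value: u64 = 0xffff_ffff_0000_1234;
        assert!(count_zero_half_words(!value, 4) > count_zero_half_words(value, 4));
        // movn rd, #0xedcb          ; rd = 0xffff_ffff_ffff_1234 (!0x1234 = 0xedcb)
        // movk rd, #0x0000, LSL #16 ; rd = 0xffff_ffff_0000_1234
        // Two instructions; a MOVZ-based sequence would need three (movz + two movk).
    }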
@@ -203,26 +223,23 @@ impl Inst {
size,
});
}
} else {
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
insts.push(Inst::MovK {
rd,
rn: rd.to_reg(), // Redef the same virtual register.
imm,
size,
});
}
}
}
assert!(first_mov_emitted);
prev_result = Some(rd.to_reg());
}
assert!(prev_result.is_some());
insts
}
}
/// Create instructions that load a 128-bit constant.
pub fn load_constant128(to_regs: ValueRegs<Writable<Reg>>, value: u128) -> SmallVec<[Inst; 4]> {
pub fn load_constant128<F: FnMut(Type) -> Writable<Reg>>(
to_regs: ValueRegs<Writable<Reg>>,
value: u128,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 4]> {
assert_eq!(to_regs.len(), 2, "Expected to load i128 into two registers");
let lower = value as u64;
@@ -231,8 +248,8 @@ impl Inst {
let lower_reg = to_regs.regs()[0];
let upper_reg = to_regs.regs()[1];
let mut load_ins = Inst::load_constant(lower_reg, lower);
let load_upper = Inst::load_constant(upper_reg, upper);
let mut load_ins = Inst::load_constant(lower_reg, lower, &mut alloc_tmp);
let load_upper = Inst::load_constant(upper_reg, upper, &mut alloc_tmp);
load_ins.extend(load_upper.into_iter());
load_ins
@@ -264,7 +281,7 @@ impl Inst {
}]
} else {
let tmp = alloc_tmp(I32);
let mut insts = Inst::load_constant(tmp, const_data as u64);
let mut insts = Inst::load_constant(tmp, const_data as u64, &mut alloc_tmp);
insts.push(Inst::MovToFpu {
rd,
@@ -304,7 +321,7 @@ impl Inst {
Inst::load_fp_constant32(rd, const_data, alloc_tmp)
} else if const_data & (u32::MAX as u64) == 0 {
let tmp = alloc_tmp(I64);
let mut insts = Inst::load_constant(tmp, const_data);
let mut insts = Inst::load_constant(tmp, const_data, &mut alloc_tmp);
insts.push(Inst::MovToFpu {
rd,
@@ -426,7 +443,7 @@ impl Inst {
smallvec![Inst::VecDupFPImm { rd, imm, size }]
} else {
let tmp = alloc_tmp(I64);
let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern)[..]);
let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern, &mut alloc_tmp)[..]);
insts.push(Inst::VecDup {
rd,
@@ -1212,14 +1229,16 @@ impl MachInst for Inst {
to_regs: ValueRegs<Writable<Reg>>,
value: u128,
ty: Type,
alloc_tmp: F,
mut alloc_tmp: F,
) -> SmallVec<[Inst; 4]> {
let to_reg = to_regs.only_reg();
match ty {
F64 => Inst::load_fp_constant64(to_reg.unwrap(), value as u64, alloc_tmp),
F32 => Inst::load_fp_constant32(to_reg.unwrap(), value as u32, alloc_tmp),
I8 | I16 | I32 | I64 | R32 | R64 => Inst::load_constant(to_reg.unwrap(), value as u64),
I128 => Inst::load_constant128(to_regs, value),
I8 | I16 | I32 | I64 | R32 | R64 => {
Inst::load_constant(to_reg.unwrap(), value as u64, &mut alloc_tmp)
}
I128 => Inst::load_constant128(to_regs, value, alloc_tmp),
_ => panic!("Cannot generate constant for type: {}", ty),
}
}
@@ -2837,7 +2856,7 @@ impl Inst {
);
} else {
let tmp = writable_spilltmp_reg();
for inst in Inst::load_constant(tmp, abs_offset).into_iter() {
for inst in Inst::load_constant(tmp, abs_offset, &mut |_| tmp).into_iter() {
ret.push_str(
&inst.print_with_state(&mut EmitState::default(), &mut empty_allocs),
);


@@ -575,7 +575,7 @@ fn lower_add_immediate(ctx: &mut Lower<Inst>, dst: Writable<Reg>, src: Reg, imm:
}
pub(crate) fn lower_constant_u64(ctx: &mut Lower<Inst>, rd: Writable<Reg>, value: u64) {
for inst in Inst::load_constant(rd, value) {
for inst in Inst::load_constant(rd, value, &mut |ty| ctx.alloc_tmp(ty).only_reg().unwrap()) {
ctx.emit(inst);
}
}


@@ -3,6 +3,7 @@
// Pull in the ISLE generated code.
pub mod generated_code;
use generated_code::Context;
use smallvec::SmallVec;
// Types that the generated ISLE code uses via `use super::*`.
use super::{
@@ -217,7 +218,6 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
} else {
value
};
let rd = self.temp_writable_reg(I64);
let size = OperandSize::Size64;
// If the top 32 bits are zero, use 32-bit `mov` operations.
@@ -226,6 +226,7 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
let lower_halfword = value as u16;
let upper_halfword = (value >> 16) as u16;
let rd = self.temp_writable_reg(I64);
if upper_halfword == u16::MAX {
self.emit(&MInst::MovWide {
op: MoveWideOp::MovN,
@@ -242,17 +243,20 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
});
if upper_halfword != 0 {
let tmp = self.temp_writable_reg(I64);
self.emit(&MInst::MovK {
rd,
rd: tmp,
rn: rd.to_reg(),
imm: MoveWideConst::maybe_with_shift(upper_halfword, 16).unwrap(),
size,
});
return tmp.to_reg();
}
}
};
return rd.to_reg();
} else if value == u64::MAX {
let rd = self.temp_writable_reg(I64);
self.emit(&MInst::MovWide {
op: MoveWideOp::MovN,
rd,
@@ -265,20 +269,34 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
// If the number of 0xffff half words is greater than the number of 0x0000 half words,
// it is more efficient to use `movn` for the first instruction.
let first_is_inverted = count_zero_half_words(!value) > count_zero_half_words(value);
// Either 0xffff or 0x0000 half words can be skipped, depending on the first
// instruction used.
let ignored_halfword = if first_is_inverted { 0xffff } else { 0 };
let mut first_mov_emitted = false;
for i in 0..4 {
let halfwords: SmallVec<[_; 4]> = (0..4)
.filter_map(|i| {
let imm16 = (value >> (16 * i)) & 0xffff;
if imm16 != ignored_halfword {
if !first_mov_emitted {
first_mov_emitted = true;
if imm16 == ignored_halfword {
None
} else {
Some((i, imm16))
}
})
.collect();
let mut prev_result = None;
for (i, imm16) in halfwords {
let shift = i * 16;
let rd = self.temp_writable_reg(I64);
if let Some(rn) = prev_result {
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
self.emit(&MInst::MovK { rd, rn, imm, size });
} else {
if first_is_inverted {
let imm =
MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16)
.unwrap();
MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, shift).unwrap();
self.emit(&MInst::MovWide {
op: MoveWideOp::MovN,
rd,
@@ -286,7 +304,7 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
size,
});
} else {
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
self.emit(&MInst::MovWide {
op: MoveWideOp::MovZ,
rd,
@@ -294,21 +312,14 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
size,
});
}
} else {
let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
self.emit(&MInst::MovK {
rd,
rn: rd.to_reg(),
imm,
size,
});
}
}
}
assert!(first_mov_emitted);
prev_result = Some(rd.to_reg());
}
return self.writable_reg_to_reg(rd);
assert!(prev_result.is_some());
return prev_result.unwrap();
fn count_zero_half_words(mut value: u64) -> usize {
let mut count = 0;
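
The ISLE helper's restructuring also changes what it returns: instead of repeatedly redefining one `rd` (the removed `rn: rd.to_reg()` MovK), it threads `prev_result` through freshly minted temps and returns the last one, presumably so each virtual register is defined exactly once for regalloc2. A toy model of that chaining, with all names invented:

    fn lower_constant(
        insts: &mut Vec<String>,
        mut fresh_temp: impl FnMut() -> u32,
        halfwords: &[(u32, u16)],
    ) -> u32 {
        let mut prev_result: Option<u32> = None;
        for &(i, imm16) in halfwords {
            let rd = fresh_temp(); // a new virtual register per instruction
            match prev_result {
                None => insts.push(format!("movz v{rd}, #{imm16:#x}, LSL #{}", 16 * i)),
                Some(rn) => insts.push(format!("movk v{rd}, v{rn}, #{imm16:#x}, LSL #{}", 16 * i)),
            }
            prev_result = Some(rd);
        }
        prev_result.expect("constant has at least one non-ignored halfword")
    }

    fn main() {
        let mut insts = Vec::new();
        let mut next = 0;
        // 0x0011_1111 decomposes into halfwords 0x1111 (shift 0) and 0x0011 (shift 16).
        let result = lower_constant(&mut insts, || { next += 1; next }, &[(0, 0x1111), (1, 0x0011)]);
        assert_eq!(result, 2); // the helper returns the last temp, not a fixed rd
        insts.iter().for_each(|i| println!("{i}"));
    }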


@@ -525,18 +525,18 @@ impl ABIMachineSpec for Riscv64MachineDeps {
insts
}
fn gen_memcpy(
fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
call_conv: isa::CallConv,
dst: Reg,
src: Reg,
tmp: Writable<Reg>,
_tmp2: Writable<Reg>,
size: usize,
mut alloc_tmp: F,
) -> SmallVec<[Self::I; 8]> {
let mut insts = SmallVec::new();
let arg0 = Writable::from_reg(x_reg(10));
let arg1 = Writable::from_reg(x_reg(11));
let arg2 = Writable::from_reg(x_reg(12));
let tmp = alloc_tmp(Self::word_type());
insts.extend(Inst::load_constant_u64(tmp, size as u64).into_iter());
insts.push(Inst::Call {
info: Box::new(CallInfo {


@@ -752,13 +752,12 @@ impl ABIMachineSpec for S390xMachineDeps {
unreachable!();
}
fn gen_memcpy(
fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
_call_conv: isa::CallConv,
_dst: Reg,
_src: Reg,
_tmp1: Writable<Reg>,
_tmp2: Writable<Reg>,
_size: usize,
_alloc: F,
) -> SmallVec<[Self::I; 8]> {
unimplemented!("StructArgs not implemented for S390X yet");
}


@@ -624,18 +624,19 @@ impl ABIMachineSpec for X64ABIMachineSpec {
insts
}
fn gen_memcpy(
fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
call_conv: isa::CallConv,
dst: Reg,
src: Reg,
temp: Writable<Reg>,
temp2: Writable<Reg>,
size: usize,
mut alloc_tmp: F,
) -> SmallVec<[Self::I; 8]> {
let mut insts = SmallVec::new();
let arg0 = get_intreg_for_arg(&call_conv, 0, 0).unwrap();
let arg1 = get_intreg_for_arg(&call_conv, 1, 1).unwrap();
let arg2 = get_intreg_for_arg(&call_conv, 2, 2).unwrap();
let temp = alloc_tmp(Self::word_type());
let temp2 = alloc_tmp(Self::word_type());
insts.extend(
Inst::gen_constant(ValueRegs::one(temp), size as u128, I64, |_| {
panic!("tmp should not be needed")


@@ -567,16 +567,14 @@ pub trait ABIMachineSpec {
) -> SmallVec<[Self::I; 2]>;
/// Generate a memcpy invocation. Used to set up struct
/// args. Takes `src`, `dst` as read-only inputs and requires two
/// temporaries to generate the call (for the size immediate and
/// possibly for the address of `memcpy` itself).
fn gen_memcpy(
/// args. Takes `src`, `dst` as read-only inputs and passes a temporary
/// allocator.
fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
call_conv: isa::CallConv,
dst: Reg,
src: Reg,
tmp1: Writable<Reg>,
tmp2: Writable<Reg>,
size: usize,
alloc_tmp: F,
) -> SmallVec<[Self::I; 8]>;
/// Get the number of spillslots required for the given register-class.
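The bound `F: FnMut(Type) -> Writable<Reg>` (rather than `Fn`) is what lets one `gen_memcpy` call mint several distinct temporaries: the allocator mutates lowering state on every call. A toy illustration, with invented names standing in for the real types:

    fn gen_memcpy_like<F: FnMut(&'static str) -> u32>(mut alloc_tmp: F) -> (u32, u32) {
        // e.g. x64 below allocates two word-sized temps; aarch64 and riscv64 one.
        (alloc_tmp("word"), alloc_tmp("word"))
    }

    fn main() {
        let mut next_vreg = 0;
        let (t1, t2) = gen_memcpy_like(|_ty| {
            next_vreg += 1;
            next_vreg
        });
        assert_ne!(t1, t2); // each call yields a distinct virtual register
    }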
@@ -2152,15 +2150,12 @@ impl<M: ABIMachineSpec> Caller<M> {
// arg regs.
let memcpy_call_conv =
isa::CallConv::for_libcall(&self.flags, ctx.sigs()[self.sig].call_conv);
let tmp1 = ctx.alloc_tmp(M::word_type()).only_reg().unwrap();
let tmp2 = ctx.alloc_tmp(M::word_type()).only_reg().unwrap();
for insn in M::gen_memcpy(
memcpy_call_conv,
dst_ptr.to_reg(),
src_ptr,
tmp1,
tmp2,
size as usize,
|ty| ctx.alloc_tmp(ty).only_reg().unwrap(),
)
.into_iter()
{


@@ -37,9 +37,9 @@ block0(v0: i32):
}
; block0:
; movz w2, #4369
; movk w2, w2, #17, LSL #16
; subs wzr, w0, w2
; movz w3, #4369
; movk w3, w3, #17, LSL #16
; subs wzr, w0, w3
; cset x0, hs
; ret
@@ -51,9 +51,9 @@ block0(v0: i32):
}
; block0:
; movz w2, #4368
; movk w2, w2, #17, LSL #16
; subs wzr, w0, w2
; movz w3, #4368
; movk w3, w3, #17, LSL #16
; subs wzr, w0, w3
; cset x0, hs
; ret
@@ -89,9 +89,9 @@ block0(v0: i32):
}
; block0:
; movz w2, #4369
; movk w2, w2, #17, LSL #16
; subs wzr, w0, w2
; movz w3, #4369
; movk w3, w3, #17, LSL #16
; subs wzr, w0, w3
; cset x0, ge
; ret
@@ -103,9 +103,9 @@ block0(v0: i32):
}
; block0:
; movz w2, #4368
; movk w2, w2, #17, LSL #16
; subs wzr, w0, w2
; movz w3, #4368
; movk w3, w3, #17, LSL #16
; subs wzr, w0, w3
; cset x0, ge
; ret


@@ -14,11 +14,11 @@ block0(v0: i8x16):
; movk x5, x5, #8208, LSL #32
; movk x5, x5, #32832, LSL #48
; dup v16.2d, x5
; and v19.16b, v2.16b, v16.16b
; ext v21.16b, v19.16b, v19.16b, #8
; zip1 v23.16b, v19.16b, v21.16b
; addv h25, v23.8h
; umov w0, v25.h[0]
; and v22.16b, v2.16b, v16.16b
; ext v24.16b, v22.16b, v22.16b, #8
; zip1 v26.16b, v22.16b, v24.16b
; addv h28, v26.8h
; umov w0, v28.h[0]
; ret
function %f2(i8x16) -> i16 {
@@ -34,11 +34,11 @@ block0(v0: i8x16):
; movk x5, x5, #8208, LSL #32
; movk x5, x5, #32832, LSL #48
; dup v16.2d, x5
; and v19.16b, v2.16b, v16.16b
; ext v21.16b, v19.16b, v19.16b, #8
; zip1 v23.16b, v19.16b, v21.16b
; addv h25, v23.8h
; umov w0, v25.h[0]
; and v22.16b, v2.16b, v16.16b
; ext v24.16b, v22.16b, v22.16b, #8
; zip1 v26.16b, v22.16b, v24.16b
; addv h28, v26.8h
; umov w0, v28.h[0]
; ret
function %f3(i16x8) -> i8 {