Allocate temporary intermediates when loading constants on aarch64 (#5366)

As loading constants on aarch64 can take up to 4 instructions, we need to plumb through some additional registers. Rather than pass a fixed list of registers in, pass an allocation function.
2022-12-01 14:29:36 -08:00
parent 03715dda9d
commit d54a27d0ea
11 changed files with 158 additions and 126 deletions
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -430,7 +430,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
        } else {
            let scratch2 = writable_tmp2_reg();
            assert_ne!(scratch2.to_reg(), from_reg);
-            insts.extend(Inst::load_constant(scratch2, imm.into()));
+            // `gen_add_imm` is only ever called after register allocation has taken place, and as a
            // result it's ok to reuse the scratch2 register here. If that changes, we'll need to
            // plumb through a way to allocate temporary virtual registers
            insts.extend(Inst::load_constant(scratch2, imm.into(), &mut |_| scratch2));
            insts.push(Inst::AluRRRExtend {
                alu_op: ALUOp::Add,
                size: OperandSize::Size64,
@@ -515,7 +518,9 @@ impl ABIMachineSpec for AArch64MachineDeps {
            ret.push(adj_inst);
        } else {
            let tmp = writable_spilltmp_reg();
-            let const_inst = Inst::load_constant(tmp, amount);
+            // `gen_sp_reg_adjust` is called after regalloc2, so it's acceptable to reuse `tmp` for
            // intermediates in `load_constant`.
            let const_inst = Inst::load_constant(tmp, amount, &mut |_| tmp);
            let adj_inst = Inst::AluRRRExtend {
                alu_op,
                size: OperandSize::Size64,
@@ -673,8 +678,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
            // itself is not allowed to use the registers.
            let start = writable_spilltmp_reg();
            let end = writable_tmp2_reg();
-            insts.extend(Inst::load_constant(start, 0));
+            // `gen_inline_probestack` is called after regalloc2, so it's acceptable to reuse
-            insts.extend(Inst::load_constant(end, frame_size.into()));
+            // `start` and `end` as temporaries in load_constant.
            insts.extend(Inst::load_constant(start, 0, &mut |_| start));
            insts.extend(Inst::load_constant(end, frame_size.into(), &mut |_| end));
            insts.push(Inst::StackProbeLoop {
                start,
                end: end.to_reg(),
@@ -1019,19 +1026,19 @@ impl ABIMachineSpec for AArch64MachineDeps {
        insts
    }
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
        call_conv: isa::CallConv,
        dst: Reg,
        src: Reg,
        tmp: Writable<Reg>,
        _tmp2: Writable<Reg>,
        size: usize,
        mut alloc_tmp: F,
    ) -> SmallVec<[Self::I; 8]> {
        let mut insts = SmallVec::new();
        let arg0 = writable_xreg(0);
        let arg1 = writable_xreg(1);
        let arg2 = writable_xreg(2);
-        insts.extend(Inst::load_constant(tmp, size as u64).into_iter());
+        let tmp = alloc_tmp(Self::word_type());
        insts.extend(Inst::load_constant(tmp, size as u64, &mut alloc_tmp));
        insts.push(Inst::Call {
            info: Box::new(CallInfo {
                dest: ExternalName::LibCall(LibCall::Memcpy),
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -64,7 +64,7 @@ pub fn mem_finalize(
            } else {
                let tmp = writable_spilltmp_reg();
                (
-                    Inst::load_constant(tmp, off as u64),
+                    Inst::load_constant(tmp, off as u64, &mut |_| tmp),
                    AMode::RegExtended {
                        rn: basereg,
                        rm: tmp.to_reg(),
@@ -3333,7 +3333,7 @@ impl MachInstEmit for Inst {
                    debug_assert!(rd.to_reg() != tmp2_reg());
                    debug_assert!(reg != tmp2_reg());
                    let tmp = writable_tmp2_reg();
-                    for insn in Inst::load_constant(tmp, abs_offset).into_iter() {
+                    for insn in Inst::load_constant(tmp, abs_offset, &mut |_| tmp).into_iter() {
                        insn.emit(&[], sink, emit_info, state);
                    }
                    let add = Inst::AluRRR {
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -130,7 +130,11 @@ fn inst_size_test() {
 impl Inst {
    /// Create an instruction that loads a constant, using one of several options (MOVZ, MOVN,
    /// logical immediate, or constant pool).
-    pub fn load_constant(rd: Writable<Reg>, value: u64) -> SmallVec<[Inst; 4]> {
+    pub fn load_constant<F: FnMut(Type) -> Writable<Reg>>(
        rd: Writable<Reg>,
        value: u64,
        alloc_tmp: &mut F,
    ) -> SmallVec<[Inst; 4]> {
        // NB: this is duplicated in `lower/isle.rs` and `inst.isle` right now,
        // if modifications are made here before this is deleted after moving to
        // ISLE then those locations should be updated as well.
@@ -169,60 +173,73 @@ impl Inst {
            } else {
                (4, OperandSize::Size64, !value)
            };
            // If the number of 0xffff half words is greater than the number of 0x0000 half words
            // it is more efficient to use `movn` for the first instruction.
            let first_is_inverted = count_zero_half_words(negated, num_half_words)
                > count_zero_half_words(value, num_half_words);
            // Either 0xffff or 0x0000 half words can be skipped, depending on the first
            // instruction used.
            let ignored_halfword = if first_is_inverted { 0xffff } else { 0 };
            let mut first_mov_emitted = false;
-            for i in 0..num_half_words {
+            let halfwords: SmallVec<[_; 4]> = (0..num_half_words)
-                let imm16 = (value >> (16 * i)) & 0xffff;
+                .filter_map(|i| {
-                if imm16 != ignored_halfword {
+                    let imm16 = (value >> (16 * i)) & 0xffff;
-                    if !first_mov_emitted {
+                    if imm16 == ignored_halfword {
-                        first_mov_emitted = true;
+                        None
                        if first_is_inverted {
                            let imm =
                                MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16)
                                    .unwrap();
                            insts.push(Inst::MovWide {
                                op: MoveWideOp::MovN,
                                rd,
                                imm,
                                size,
                            });
                        } else {
                            let imm =
                                MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
                            insts.push(Inst::MovWide {
                                op: MoveWideOp::MovZ,
                                rd,
                                imm,
                                size,
                            });
                        }
                    } else {
-                        let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
+                        Some((i, imm16))
-                        insts.push(Inst::MovK {
+                    }
                })
                .collect();
            let mut prev_result = None;
            let last_index = halfwords.last().unwrap().0;
            for (i, imm16) in halfwords {
                let shift = i * 16;
                let rd = if i == last_index { rd } else { alloc_tmp(I16) };
                if let Some(rn) = prev_result {
                    let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
                    insts.push(Inst::MovK { rd, rn, imm, size });
                } else {
                    if first_is_inverted {
                        let imm =
                            MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, shift)
                                .unwrap();
                        insts.push(Inst::MovWide {
                            op: MoveWideOp::MovN,
                            rd,
                            imm,
                            size,
                        });
                    } else {
                        let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
                        insts.push(Inst::MovWide {
                            op: MoveWideOp::MovZ,
                            rd,
                            rn: rd.to_reg(), // Redef the same virtual register.
                            imm,
                            size,
                        });
                    }
                }
                prev_result = Some(rd.to_reg());
            }
-            assert!(first_mov_emitted);
+            assert!(prev_result.is_some());
            insts
        }
    }
    /// Create instructions that load a 128-bit constant.
-    pub fn load_constant128(to_regs: ValueRegs<Writable<Reg>>, value: u128) -> SmallVec<[Inst; 4]> {
+    pub fn load_constant128<F: FnMut(Type) -> Writable<Reg>>(
        to_regs: ValueRegs<Writable<Reg>>,
        value: u128,
        mut alloc_tmp: F,
    ) -> SmallVec<[Inst; 4]> {
        assert_eq!(to_regs.len(), 2, "Expected to load i128 into two registers");
        let lower = value as u64;
@@ -231,8 +248,8 @@ impl Inst {
        let lower_reg = to_regs.regs()[0];
        let upper_reg = to_regs.regs()[1];
-        let mut load_ins = Inst::load_constant(lower_reg, lower);
+        let mut load_ins = Inst::load_constant(lower_reg, lower, &mut alloc_tmp);
-        let load_upper = Inst::load_constant(upper_reg, upper);
+        let load_upper = Inst::load_constant(upper_reg, upper, &mut alloc_tmp);
        load_ins.extend(load_upper.into_iter());
        load_ins
@@ -264,7 +281,7 @@ impl Inst {
            }]
        } else {
            let tmp = alloc_tmp(I32);
-            let mut insts = Inst::load_constant(tmp, const_data as u64);
+            let mut insts = Inst::load_constant(tmp, const_data as u64, &mut alloc_tmp);
            insts.push(Inst::MovToFpu {
                rd,
@@ -304,7 +321,7 @@ impl Inst {
            Inst::load_fp_constant32(rd, const_data, alloc_tmp)
        } else if const_data & (u32::MAX as u64) == 0 {
            let tmp = alloc_tmp(I64);
-            let mut insts = Inst::load_constant(tmp, const_data);
+            let mut insts = Inst::load_constant(tmp, const_data, &mut alloc_tmp);
            insts.push(Inst::MovToFpu {
                rd,
@@ -426,7 +443,7 @@ impl Inst {
            smallvec![Inst::VecDupFPImm { rd, imm, size }]
        } else {
            let tmp = alloc_tmp(I64);
-            let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern)[..]);
+            let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern, &mut alloc_tmp)[..]);
            insts.push(Inst::VecDup {
                rd,
@@ -1212,14 +1229,16 @@ impl MachInst for Inst {
        to_regs: ValueRegs<Writable<Reg>>,
        value: u128,
        ty: Type,
-        alloc_tmp: F,
+        mut alloc_tmp: F,
    ) -> SmallVec<[Inst; 4]> {
        let to_reg = to_regs.only_reg();
        match ty {
            F64 => Inst::load_fp_constant64(to_reg.unwrap(), value as u64, alloc_tmp),
            F32 => Inst::load_fp_constant32(to_reg.unwrap(), value as u32, alloc_tmp),
-            I8 | I16 | I32 | I64 | R32 | R64 => Inst::load_constant(to_reg.unwrap(), value as u64),
+            I8 | I16 | I32 | I64 | R32 | R64 => {
-            I128 => Inst::load_constant128(to_regs, value),
+                Inst::load_constant(to_reg.unwrap(), value as u64, &mut alloc_tmp)
            }
            I128 => Inst::load_constant128(to_regs, value, alloc_tmp),
            _ => panic!("Cannot generate constant for type: {}", ty),
        }
    }
@@ -2837,7 +2856,7 @@ impl Inst {
                    );
                } else {
                    let tmp = writable_spilltmp_reg();
-                    for inst in Inst::load_constant(tmp, abs_offset).into_iter() {
+                    for inst in Inst::load_constant(tmp, abs_offset, &mut |_| tmp).into_iter() {
                        ret.push_str(
                            &inst.print_with_state(&mut EmitState::default(), &mut empty_allocs),
                        );
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -575,7 +575,7 @@ fn lower_add_immediate(ctx: &mut Lower<Inst>, dst: Writable<Reg>, src: Reg, imm:
 }
 pub(crate) fn lower_constant_u64(ctx: &mut Lower<Inst>, rd: Writable<Reg>, value: u64) {
-    for inst in Inst::load_constant(rd, value) {
+    for inst in Inst::load_constant(rd, value, &mut |ty| ctx.alloc_tmp(ty).only_reg().unwrap()) {
        ctx.emit(inst);
    }
 }
--- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
@@ -3,6 +3,7 @@
 // Pull in the ISLE generated code.
 pub mod generated_code;
 use generated_code::Context;
 use smallvec::SmallVec;
 // Types that the generated ISLE code uses via `use super::*`.
 use super::{
@@ -217,7 +218,6 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
        } else {
            value
        };
        let rd = self.temp_writable_reg(I64);
        let size = OperandSize::Size64;
        // If the top 32 bits are zero, use 32-bit `mov` operations.
@@ -226,6 +226,7 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
            let lower_halfword = value as u16;
            let upper_halfword = (value >> 16) as u16;
            let rd = self.temp_writable_reg(I64);
            if upper_halfword == u16::MAX {
                self.emit(&MInst::MovWide {
                    op: MoveWideOp::MovN,
@@ -242,17 +243,20 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
                });
                if upper_halfword != 0 {
                    let tmp = self.temp_writable_reg(I64);
                    self.emit(&MInst::MovK {
-                        rd,
+                        rd: tmp,
                        rn: rd.to_reg(),
                        imm: MoveWideConst::maybe_with_shift(upper_halfword, 16).unwrap(),
                        size,
                    });
                    return tmp.to_reg();
                }
-            }
+            };
            return rd.to_reg();
        } else if value == u64::MAX {
            let rd = self.temp_writable_reg(I64);
            self.emit(&MInst::MovWide {
                op: MoveWideOp::MovN,
                rd,
@@ -265,50 +269,57 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
        // If the number of 0xffff half words is greater than the number of 0x0000 half words
        // it is more efficient to use `movn` for the first instruction.
        let first_is_inverted = count_zero_half_words(!value) > count_zero_half_words(value);
        // Either 0xffff or 0x0000 half words can be skipped, depending on the first
        // instruction used.
        let ignored_halfword = if first_is_inverted { 0xffff } else { 0 };
        let mut first_mov_emitted = false;
-        for i in 0..4 {
+        let halfwords: SmallVec<[_; 4]> = (0..4)
-            let imm16 = (value >> (16 * i)) & 0xffff;
+            .filter_map(|i| {
-            if imm16 != ignored_halfword {
+                let imm16 = (value >> (16 * i)) & 0xffff;
-                if !first_mov_emitted {
+                if imm16 == ignored_halfword {
-                    first_mov_emitted = true;
+                    None
                    if first_is_inverted {
                        let imm =
                            MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16)
                                .unwrap();
                        self.emit(&MInst::MovWide {
                            op: MoveWideOp::MovN,
                            rd,
                            imm,
                            size,
                        });
                    } else {
                        let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
                        self.emit(&MInst::MovWide {
                            op: MoveWideOp::MovZ,
                            rd,
                            imm,
                            size,
                        });
                    }
                } else {
-                    let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
+                    Some((i, imm16))
-                    self.emit(&MInst::MovK {
+                }
            })
            .collect();
        let mut prev_result = None;
        for (i, imm16) in halfwords {
            let shift = i * 16;
            let rd = self.temp_writable_reg(I64);
            if let Some(rn) = prev_result {
                let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
                self.emit(&MInst::MovK { rd, rn, imm, size });
            } else {
                if first_is_inverted {
                    let imm =
                        MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, shift).unwrap();
                    self.emit(&MInst::MovWide {
                        op: MoveWideOp::MovN,
                        rd,
                        imm,
                        size,
                    });
                } else {
                    let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
                    self.emit(&MInst::MovWide {
                        op: MoveWideOp::MovZ,
                        rd,
                        rn: rd.to_reg(),
                        imm,
                        size,
                    });
                }
            }
            prev_result = Some(rd.to_reg());
        }
-        assert!(first_mov_emitted);
+        assert!(prev_result.is_some());
-        return self.writable_reg_to_reg(rd);
+        return prev_result.unwrap();
        fn count_zero_half_words(mut value: u64) -> usize {
            let mut count = 0;
--- a/cranelift/codegen/src/isa/riscv64/abi.rs
+++ b/cranelift/codegen/src/isa/riscv64/abi.rs
@@ -525,18 +525,18 @@ impl ABIMachineSpec for Riscv64MachineDeps {
        insts
    }
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
        call_conv: isa::CallConv,
        dst: Reg,
        src: Reg,
        tmp: Writable<Reg>,
        _tmp2: Writable<Reg>,
        size: usize,
        mut alloc_tmp: F,
    ) -> SmallVec<[Self::I; 8]> {
        let mut insts = SmallVec::new();
        let arg0 = Writable::from_reg(x_reg(10));
        let arg1 = Writable::from_reg(x_reg(11));
        let arg2 = Writable::from_reg(x_reg(12));
        let tmp = alloc_tmp(Self::word_type());
        insts.extend(Inst::load_constant_u64(tmp, size as u64).into_iter());
        insts.push(Inst::Call {
            info: Box::new(CallInfo {
--- a/cranelift/codegen/src/isa/s390x/abi.rs
+++ b/cranelift/codegen/src/isa/s390x/abi.rs
@@ -752,13 +752,12 @@ impl ABIMachineSpec for S390xMachineDeps {
        unreachable!();
    }
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
        _call_conv: isa::CallConv,
        _dst: Reg,
        _src: Reg,
        _tmp1: Writable<Reg>,
        _tmp2: Writable<Reg>,
        _size: usize,
        _alloc: F,
    ) -> SmallVec<[Self::I; 8]> {
        unimplemented!("StructArgs not implemented for S390X yet");
    }
--- a/cranelift/codegen/src/isa/x64/abi.rs
+++ b/cranelift/codegen/src/isa/x64/abi.rs
@@ -624,18 +624,19 @@ impl ABIMachineSpec for X64ABIMachineSpec {
        insts
    }
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
        call_conv: isa::CallConv,
        dst: Reg,
        src: Reg,
        temp: Writable<Reg>,
        temp2: Writable<Reg>,
        size: usize,
        mut alloc_tmp: F,
    ) -> SmallVec<[Self::I; 8]> {
        let mut insts = SmallVec::new();
        let arg0 = get_intreg_for_arg(&call_conv, 0, 0).unwrap();
        let arg1 = get_intreg_for_arg(&call_conv, 1, 1).unwrap();
        let arg2 = get_intreg_for_arg(&call_conv, 2, 2).unwrap();
        let temp = alloc_tmp(Self::word_type());
        let temp2 = alloc_tmp(Self::word_type());
        insts.extend(
            Inst::gen_constant(ValueRegs::one(temp), size as u128, I64, |_| {
                panic!("tmp should not be needed")
--- a/cranelift/codegen/src/machinst/abi.rs
+++ b/cranelift/codegen/src/machinst/abi.rs
@@ -567,16 +567,14 @@ pub trait ABIMachineSpec {
    ) -> SmallVec<[Self::I; 2]>;
    /// Generate a memcpy invocation. Used to set up struct
-    /// args. Takes `src`, `dst` as read-only inputs and requires two
+    /// args. Takes `src`, `dst` as read-only inputs and passes a temporary
-    /// temporaries to generate the call (for the size immediate and
+    /// allocator.
-    /// possibly for the address of `memcpy` itself).
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
    fn gen_memcpy(
        call_conv: isa::CallConv,
        dst: Reg,
        src: Reg,
        tmp1: Writable<Reg>,
        tmp2: Writable<Reg>,
        size: usize,
        alloc_tmp: F,
    ) -> SmallVec<[Self::I; 8]>;
    /// Get the number of spillslots required for the given register-class.
@@ -2152,15 +2150,12 @@ impl<M: ABIMachineSpec> Caller<M> {
                // arg regs.
                let memcpy_call_conv =
                    isa::CallConv::for_libcall(&self.flags, ctx.sigs()[self.sig].call_conv);
                let tmp1 = ctx.alloc_tmp(M::word_type()).only_reg().unwrap();
                let tmp2 = ctx.alloc_tmp(M::word_type()).only_reg().unwrap();
                for insn in M::gen_memcpy(
                    memcpy_call_conv,
                    dst_ptr.to_reg(),
                    src_ptr,
                    tmp1,
                    tmp2,
                    size as usize,
                    |ty| ctx.alloc_tmp(ty).only_reg().unwrap(),
                )
                .into_iter()
                {
--- a/cranelift/filetests/filetests/isa/aarch64/icmp-const.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/icmp-const.clif
@@ -37,9 +37,9 @@ block0(v0: i32):
 }
 ; block0:
-;   movz w2, #4369
+;   movz w3, #4369
-;   movk w2, w2, #17, LSL #16
+;   movk w3, w3, #17, LSL #16
-;   subs wzr, w0, w2
+;   subs wzr, w0, w3
 ;   cset x0, hs
 ;   ret
@@ -51,9 +51,9 @@ block0(v0: i32):
 }
 ; block0:
-;   movz w2, #4368
+;   movz w3, #4368
-;   movk w2, w2, #17, LSL #16
+;   movk w3, w3, #17, LSL #16
-;   subs wzr, w0, w2
+;   subs wzr, w0, w3
 ;   cset x0, hs
 ;   ret
@@ -89,9 +89,9 @@ block0(v0: i32):
 }
 ; block0:
-;   movz w2, #4369
+;   movz w3, #4369
-;   movk w2, w2, #17, LSL #16
+;   movk w3, w3, #17, LSL #16
-;   subs wzr, w0, w2
+;   subs wzr, w0, w3
 ;   cset x0, ge
 ;   ret
@@ -103,9 +103,9 @@ block0(v0: i32):
 }
 ; block0:
-;   movz w2, #4368
+;   movz w3, #4368
-;   movk w2, w2, #17, LSL #16
+;   movk w3, w3, #17, LSL #16
-;   subs wzr, w0, w2
+;   subs wzr, w0, w3
 ;   cset x0, ge
 ;   ret
--- a/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif
@@ -14,11 +14,11 @@ block0(v0: i8x16):
 ;   movk x5, x5, #8208, LSL #32
 ;   movk x5, x5, #32832, LSL #48
 ;   dup v16.2d, x5
-;   and v19.16b, v2.16b, v16.16b
+;   and v22.16b, v2.16b, v16.16b
-;   ext v21.16b, v19.16b, v19.16b, #8
+;   ext v24.16b, v22.16b, v22.16b, #8
-;   zip1 v23.16b, v19.16b, v21.16b
+;   zip1 v26.16b, v22.16b, v24.16b
-;   addv h25, v23.8h
+;   addv h28, v26.8h
-;   umov w0, v25.h[0]
+;   umov w0, v28.h[0]
 ;   ret
 function %f2(i8x16) -> i16 {
@@ -34,11 +34,11 @@ block0(v0: i8x16):
 ;   movk x5, x5, #8208, LSL #32
 ;   movk x5, x5, #32832, LSL #48
 ;   dup v16.2d, x5
-;   and v19.16b, v2.16b, v16.16b
+;   and v22.16b, v2.16b, v16.16b
-;   ext v21.16b, v19.16b, v19.16b, #8
+;   ext v24.16b, v22.16b, v22.16b, #8
-;   zip1 v23.16b, v19.16b, v21.16b
+;   zip1 v26.16b, v22.16b, v24.16b
-;   addv h25, v23.8h
+;   addv h28, v26.8h
-;   umov w0, v25.h[0]
+;   umov w0, v28.h[0]
 ;   ret
 function %f3(i16x8) -> i8 {