From d54a27d0ea83090bf1bd46032a2129677f70df0f Mon Sep 17 00:00:00 2001
From: Trevor Elliott
Date: Thu, 1 Dec 2022 14:29:36 -0800
Subject: [PATCH] Allocate temporary intermediates when loading constants on
 aarch64 (#5366)

As loading constants on aarch64 can take up to 4 instructions, we need
to plumb through some additional registers. Rather than pass a fixed
list of registers in, pass an allocation function.
---
 cranelift/codegen/src/isa/aarch64/abi.rs      |  23 ++--
 .../codegen/src/isa/aarch64/inst/emit.rs      |   4 +-
 cranelift/codegen/src/isa/aarch64/inst/mod.rs | 101 +++++++++++-------
 cranelift/codegen/src/isa/aarch64/lower.rs    |   2 +-
 .../codegen/src/isa/aarch64/lower/isle.rs     |  77 +++++++------
 cranelift/codegen/src/isa/riscv64/abi.rs      |   6 +-
 cranelift/codegen/src/isa/s390x/abi.rs        |   5 +-
 cranelift/codegen/src/isa/x64/abi.rs          |   7 +-
 cranelift/codegen/src/machinst/abi.rs         |  15 +--
 .../filetests/isa/aarch64/icmp-const.clif     |  24 ++--
 .../filetests/isa/aarch64/vhigh_bits.clif     |  20 ++--
 11 files changed, 158 insertions(+), 126 deletions(-)
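Before the diff itself, a compact, self-contained sketch of the pattern this
patch adopts. Everything below (`TmpReg`, `load_wide_const`) is an
illustrative stand-in rather than cranelift's real API: the constant loader
asks an `alloc_tmp` callback for intermediates, so lowering-time callers can
hand out fresh virtual registers while post-regalloc callers satisfy the same
interface with a closure that keeps returning one fixed scratch register.

    // Stand-in for cranelift's `Writable<Reg>`.
    #[derive(Clone, Copy, Debug)]
    struct TmpReg(u32);

    /// Emits up to four move instructions. Intermediate results go to
    /// caller-provided temporaries and only the final instruction defines
    /// `dst`, which keeps the sequence in SSA form when run before register
    /// allocation. Assumes `value` has at least one non-zero halfword.
    fn load_wide_const<F: FnMut() -> TmpReg>(
        dst: TmpReg,
        value: u64,
        alloc_tmp: &mut F,
    ) -> Vec<String> {
        let halfwords: Vec<(u32, u64)> = (0..4)
            .map(|i| (i, (value >> (16 * i)) & 0xffff))
            .filter(|&(_, hw)| hw != 0)
            .collect();
        let last = halfwords.last().map(|&(i, _)| i);
        let mut seq = Vec::new();
        let mut prev: Option<TmpReg> = None;
        for &(i, hw) in &halfwords {
            // Only the last emitted instruction writes the real destination.
            let rd = if Some(i) == last { dst } else { alloc_tmp() };
            match prev {
                None => seq.push(format!("movz {rd:?}, #{hw:#x}, lsl #{}", 16 * i)),
                Some(rn) => {
                    seq.push(format!("movk {rd:?} (from {rn:?}), #{hw:#x}, lsl #{}", 16 * i))
                }
            }
            prev = Some(rd);
        }
        seq
    }

    fn main() {
        // Before regalloc: hand out a fresh virtual register per request.
        let mut next = 0;
        let mut fresh = || {
            next += 1;
            TmpReg(next)
        };
        for inst in load_wide_const(TmpReg(100), 0x0001_0000_abcd_0000, &mut fresh) {
            println!("{inst}");
        }

        // After regalloc (e.g. prologue code): reusing one scratch register
        // is fine, which is what the `&mut |_| scratch` closures below do.
        let scratch = TmpReg(16);
        let _ = load_wide_const(scratch, 0x1234_5678, &mut || scratch);
    }

The same two call shapes appear throughout the diff: `&mut |_| scratch2` in
prologue/probestack code, and `&mut |ty| ctx.alloc_tmp(ty).only_reg().unwrap()`
in lowering.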
diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs
index d57a804ef7..dedff9cf25 100644
--- a/cranelift/codegen/src/isa/aarch64/abi.rs
+++ b/cranelift/codegen/src/isa/aarch64/abi.rs
@@ -430,7 +430,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
         } else {
             let scratch2 = writable_tmp2_reg();
             assert_ne!(scratch2.to_reg(), from_reg);
-            insts.extend(Inst::load_constant(scratch2, imm.into()));
+            // `gen_add_imm` is only ever called after register allocation has taken place, and
+            // as a result it's ok to reuse the scratch2 register here. If that changes, we'll
+            // need to plumb through a way to allocate temporary virtual registers.
+            insts.extend(Inst::load_constant(scratch2, imm.into(), &mut |_| scratch2));
             insts.push(Inst::AluRRRExtend {
                 alu_op: ALUOp::Add,
                 size: OperandSize::Size64,
@@ -515,7 +518,9 @@ impl ABIMachineSpec for AArch64MachineDeps {
             ret.push(adj_inst);
         } else {
             let tmp = writable_spilltmp_reg();
-            let const_inst = Inst::load_constant(tmp, amount);
+            // `gen_sp_reg_adjust` is called after regalloc2, so it's acceptable to reuse `tmp`
+            // for intermediates in `load_constant`.
+            let const_inst = Inst::load_constant(tmp, amount, &mut |_| tmp);
             let adj_inst = Inst::AluRRRExtend {
                 alu_op,
                 size: OperandSize::Size64,
@@ -673,8 +678,10 @@ impl ABIMachineSpec for AArch64MachineDeps {
         // itself is not allowed to use the registers.
         let start = writable_spilltmp_reg();
         let end = writable_tmp2_reg();
-        insts.extend(Inst::load_constant(start, 0));
-        insts.extend(Inst::load_constant(end, frame_size.into()));
+        // `gen_inline_probestack` is called after regalloc2, so it's acceptable to reuse
+        // `start` and `end` as temporaries in `load_constant`.
+        insts.extend(Inst::load_constant(start, 0, &mut |_| start));
+        insts.extend(Inst::load_constant(end, frame_size.into(), &mut |_| end));
         insts.push(Inst::StackProbeLoop {
             start,
             end: end.to_reg(),
@@ -1019,19 +1026,19 @@ impl ABIMachineSpec for AArch64MachineDeps {
         insts
     }
 
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
         call_conv: isa::CallConv,
         dst: Reg,
         src: Reg,
-        tmp: Writable<Reg>,
-        _tmp2: Writable<Reg>,
         size: usize,
+        mut alloc_tmp: F,
     ) -> SmallVec<[Self::I; 8]> {
         let mut insts = SmallVec::new();
         let arg0 = writable_xreg(0);
         let arg1 = writable_xreg(1);
         let arg2 = writable_xreg(2);
-        insts.extend(Inst::load_constant(tmp, size as u64).into_iter());
+        let tmp = alloc_tmp(Self::word_type());
+        insts.extend(Inst::load_constant(tmp, size as u64, &mut alloc_tmp));
         insts.push(Inst::Call {
             info: Box::new(CallInfo {
                 dest: ExternalName::LibCall(LibCall::Memcpy),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 0280a4c00f..15ed484dfe 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -64,7 +64,7 @@ pub fn mem_finalize(
         } else {
             let tmp = writable_spilltmp_reg();
             (
-                Inst::load_constant(tmp, off as u64),
+                Inst::load_constant(tmp, off as u64, &mut |_| tmp),
                 AMode::RegExtended {
                     rn: basereg,
                     rm: tmp.to_reg(),
@@ -3333,7 +3333,7 @@ impl MachInstEmit for Inst {
                     debug_assert!(rd.to_reg() != tmp2_reg());
                     debug_assert!(reg != tmp2_reg());
                     let tmp = writable_tmp2_reg();
-                    for insn in Inst::load_constant(tmp, abs_offset).into_iter() {
+                    for insn in Inst::load_constant(tmp, abs_offset, &mut |_| tmp).into_iter() {
                         insn.emit(&[], sink, emit_info, state);
                     }
                     let add = Inst::AluRRR {
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index 3ed883a494..dca2a479d0 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -130,7 +130,11 @@ fn inst_size_test() {
 impl Inst {
     /// Create an instruction that loads a constant, using one of several options (MOVZ, MOVN,
     /// logical immediate, or constant pool).
-    pub fn load_constant(rd: Writable<Reg>, value: u64) -> SmallVec<[Inst; 4]> {
+    pub fn load_constant<F: FnMut(Type) -> Writable<Reg>>(
+        rd: Writable<Reg>,
+        value: u64,
+        alloc_tmp: &mut F,
+    ) -> SmallVec<[Inst; 4]> {
         // NB: this is duplicated in `lower/isle.rs` and `inst.isle` right now,
         // if modifications are made here before this is deleted after moving to
         // ISLE then those locations should be updated as well.
@@ -169,60 +173,73 @@ impl Inst {
         } else {
             (4, OperandSize::Size64, !value)
         };
+
         // If the number of 0xffff half words is greater than the number of 0x0000 half words
         // it is more efficient to use `movn` for the first instruction.
         let first_is_inverted = count_zero_half_words(negated, num_half_words)
             > count_zero_half_words(value, num_half_words);
+
         // Either 0xffff or 0x0000 half words can be skipped, depending on the first
         // instruction used.
         let ignored_halfword = if first_is_inverted { 0xffff } else { 0 };
-        let mut first_mov_emitted = false;
-        for i in 0..num_half_words {
-            let imm16 = (value >> (16 * i)) & 0xffff;
-            if imm16 != ignored_halfword {
-                if !first_mov_emitted {
-                    first_mov_emitted = true;
-                    if first_is_inverted {
-                        let imm =
-                            MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16)
-                                .unwrap();
-                        insts.push(Inst::MovWide {
-                            op: MoveWideOp::MovN,
-                            rd,
-                            imm,
-                            size,
-                        });
-                    } else {
-                        let imm =
-                            MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
-                        insts.push(Inst::MovWide {
-                            op: MoveWideOp::MovZ,
-                            rd,
-                            imm,
-                            size,
-                        });
-                    }
+        let halfwords: SmallVec<[_; 4]> = (0..num_half_words)
+            .filter_map(|i| {
+                let imm16 = (value >> (16 * i)) & 0xffff;
+                if imm16 == ignored_halfword {
+                    None
                 } else {
-                    let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
-                    insts.push(Inst::MovK {
+                    Some((i, imm16))
+                }
+            })
+            .collect();
+
+        let mut prev_result = None;
+        let last_index = halfwords.last().unwrap().0;
+        for (i, imm16) in halfwords {
+            let shift = i * 16;
+            let rd = if i == last_index { rd } else { alloc_tmp(I16) };
+
+            if let Some(rn) = prev_result {
+                let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
+                insts.push(Inst::MovK { rd, rn, imm, size });
+            } else {
+                if first_is_inverted {
+                    let imm =
+                        MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, shift)
+                            .unwrap();
+                    insts.push(Inst::MovWide {
+                        op: MoveWideOp::MovN,
+                        rd,
+                        imm,
+                        size,
+                    });
+                } else {
+                    let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
+                    insts.push(Inst::MovWide {
+                        op: MoveWideOp::MovZ,
                         rd,
-                        rn: rd.to_reg(), // Redef the same virtual register.
                         imm,
                         size,
                     });
                 }
             }
+
+            prev_result = Some(rd.to_reg());
         }
-        assert!(first_mov_emitted);
+        assert!(prev_result.is_some());
 
         insts
     }
 
     /// Create instructions that load a 128-bit constant.
-    pub fn load_constant128(to_regs: ValueRegs<Writable<Reg>>, value: u128) -> SmallVec<[Inst; 4]> {
+    pub fn load_constant128<F: FnMut(Type) -> Writable<Reg>>(
+        to_regs: ValueRegs<Writable<Reg>>,
+        value: u128,
+        mut alloc_tmp: F,
+    ) -> SmallVec<[Inst; 4]> {
         assert_eq!(to_regs.len(), 2, "Expected to load i128 into two registers");
 
         let lower = value as u64;
@@ -231,8 +248,8 @@ impl Inst {
         let lower_reg = to_regs.regs()[0];
         let upper_reg = to_regs.regs()[1];
 
-        let mut load_ins = Inst::load_constant(lower_reg, lower);
-        let load_upper = Inst::load_constant(upper_reg, upper);
+        let mut load_ins = Inst::load_constant(lower_reg, lower, &mut alloc_tmp);
+        let load_upper = Inst::load_constant(upper_reg, upper, &mut alloc_tmp);
 
         load_ins.extend(load_upper.into_iter());
         load_ins
@@ -264,7 +281,7 @@ impl Inst {
             }]
         } else {
             let tmp = alloc_tmp(I32);
-            let mut insts = Inst::load_constant(tmp, const_data as u64);
+            let mut insts = Inst::load_constant(tmp, const_data as u64, &mut alloc_tmp);
 
             insts.push(Inst::MovToFpu {
                 rd,
@@ -304,7 +321,7 @@ impl Inst {
             Inst::load_fp_constant32(rd, const_data, alloc_tmp)
         } else if const_data & (u32::MAX as u64) == 0 {
             let tmp = alloc_tmp(I64);
-            let mut insts = Inst::load_constant(tmp, const_data);
+            let mut insts = Inst::load_constant(tmp, const_data, &mut alloc_tmp);
 
             insts.push(Inst::MovToFpu {
                 rd,
@@ -426,7 +443,7 @@ impl Inst {
             smallvec![Inst::VecDupFPImm { rd, imm, size }]
         } else {
             let tmp = alloc_tmp(I64);
-            let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern)[..]);
+            let mut insts = SmallVec::from(&Inst::load_constant(tmp, pattern, &mut alloc_tmp)[..]);
 
             insts.push(Inst::VecDup {
                 rd,
@@ -1212,14 +1229,16 @@ impl MachInst for Inst {
     fn gen_constant<F: FnMut(Type) -> Writable<Reg>>(
         to_regs: ValueRegs<Writable<Reg>>,
         value: u128,
         ty: Type,
-        alloc_tmp: F,
+        mut alloc_tmp: F,
     ) -> SmallVec<[Inst; 4]> {
         let to_reg = to_regs.only_reg();
         match ty {
             F64 => Inst::load_fp_constant64(to_reg.unwrap(), value as u64, alloc_tmp),
             F32 => Inst::load_fp_constant32(to_reg.unwrap(), value as u32, alloc_tmp),
-            I8 | I16 | I32 | I64 | R32 | R64 => Inst::load_constant(to_reg.unwrap(), value as u64),
-            I128 => Inst::load_constant128(to_regs, value),
+            I8 | I16 | I32 | I64 | R32 | R64 => {
+                Inst::load_constant(to_reg.unwrap(), value as u64, &mut alloc_tmp)
+            }
+            I128 => Inst::load_constant128(to_regs, value, alloc_tmp),
             _ => panic!("Cannot generate constant for type: {}", ty),
         }
     }
@@ -2837,7 +2856,7 @@ impl Inst {
                     );
                 } else {
                     let tmp = writable_spilltmp_reg();
-                    for inst in Inst::load_constant(tmp, abs_offset).into_iter() {
+                    for inst in Inst::load_constant(tmp, abs_offset, &mut |_| tmp).into_iter() {
                         ret.push_str(
                             &inst.print_with_state(&mut EmitState::default(), &mut empty_allocs),
                         );
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 7ba0249e77..2396bf1889 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -575,7 +575,7 @@ fn lower_add_immediate(ctx: &mut Lower<Inst>, dst: Writable<Reg>, src: Reg, imm:
 }
 
 pub(crate) fn lower_constant_u64(ctx: &mut Lower<Inst>, rd: Writable<Reg>, value: u64) {
-    for inst in Inst::load_constant(rd, value) {
+    for inst in Inst::load_constant(rd, value, &mut |ty| ctx.alloc_tmp(ty).only_reg().unwrap()) {
         ctx.emit(inst);
     }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
index e47e3fc6b1..a9a0f674c3 100644
--- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
@@ -3,6 +3,7 @@
 
 // Pull in the ISLE generated code.
 pub mod generated_code;
 use generated_code::Context;
+use smallvec::SmallVec;
 
 // Types that the generated ISLE code uses via `use super::*`.
 use super::{
@@ -217,7 +218,6 @@ impl Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6> {
         } else {
             value
         };
-        let rd = self.temp_writable_reg(I64);
         let size = OperandSize::Size64;
 
         // If the top 32 bits are zero, use 32-bit `mov` operations.
@@ -226,6 +226,7 @@
             let lower_halfword = value as u16;
             let upper_halfword = (value >> 16) as u16;
 
+            let rd = self.temp_writable_reg(I64);
             if upper_halfword == u16::MAX {
                 self.emit(&MInst::MovWide {
                     op: MoveWideOp::MovN,
@@ -242,17 +243,20 @@
                 });
 
                 if upper_halfword != 0 {
+                    let tmp = self.temp_writable_reg(I64);
                     self.emit(&MInst::MovK {
-                        rd,
+                        rd: tmp,
                         rn: rd.to_reg(),
                         imm: MoveWideConst::maybe_with_shift(upper_halfword, 16).unwrap(),
                         size,
                     });
+                    return tmp.to_reg();
                 }
-            }
+            };
 
             return rd.to_reg();
         } else if value == u64::MAX {
+            let rd = self.temp_writable_reg(I64);
             self.emit(&MInst::MovWide {
                 op: MoveWideOp::MovN,
                 rd,
@@ -265,50 +269,57 @@
         // If the number of 0xffff half words is greater than the number of 0x0000 half words
         // it is more efficient to use `movn` for the first instruction.
         let first_is_inverted = count_zero_half_words(!value) > count_zero_half_words(value);
+
         // Either 0xffff or 0x0000 half words can be skipped, depending on the first
        // instruction used.
         let ignored_halfword = if first_is_inverted { 0xffff } else { 0 };
-        let mut first_mov_emitted = false;
-        for i in 0..4 {
-            let imm16 = (value >> (16 * i)) & 0xffff;
-            if imm16 != ignored_halfword {
-                if !first_mov_emitted {
-                    first_mov_emitted = true;
-                    if first_is_inverted {
-                        let imm =
-                            MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16)
-                                .unwrap();
-                        self.emit(&MInst::MovWide {
-                            op: MoveWideOp::MovN,
-                            rd,
-                            imm,
-                            size,
-                        });
-                    } else {
-                        let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
-                        self.emit(&MInst::MovWide {
-                            op: MoveWideOp::MovZ,
-                            rd,
-                            imm,
-                            size,
-                        });
-                    }
+        let halfwords: SmallVec<[_; 4]> = (0..4)
+            .filter_map(|i| {
+                let imm16 = (value >> (16 * i)) & 0xffff;
+                if imm16 == ignored_halfword {
+                    None
                 } else {
-                    let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap();
-                    self.emit(&MInst::MovK {
+                    Some((i, imm16))
+                }
+            })
+            .collect();
+
+        let mut prev_result = None;
+        for (i, imm16) in halfwords {
+            let shift = i * 16;
+            let rd = self.temp_writable_reg(I64);
+
+            if let Some(rn) = prev_result {
+                let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
+                self.emit(&MInst::MovK { rd, rn, imm, size });
+            } else {
+                if first_is_inverted {
+                    let imm =
+                        MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, shift).unwrap();
+                    self.emit(&MInst::MovWide {
+                        op: MoveWideOp::MovN,
+                        rd,
+                        imm,
+                        size,
+                    });
+                } else {
+                    let imm = MoveWideConst::maybe_with_shift(imm16 as u16, shift).unwrap();
+                    self.emit(&MInst::MovWide {
+                        op: MoveWideOp::MovZ,
                         rd,
-                        rn: rd.to_reg(),
                         imm,
                         size,
                     });
                 }
             }
+
+            prev_result = Some(rd.to_reg());
         }
-        assert!(first_mov_emitted);
+        assert!(prev_result.is_some());
 
-        return self.writable_reg_to_reg(rd);
+        return prev_result.unwrap();
 
         fn count_zero_half_words(mut value: u64) -> usize {
             let mut count = 0;
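Both copies of the rewritten loop (in `inst/mod.rs` above and in
`lower/isle.rs` here) lean on the same MOVN-vs-MOVZ heuristic, which is
easiest to see with a worked constant. In this self-contained sketch only
`count_zero_half_words` mirrors the helper in the patch; the driver in `main`
is purely illustrative:

    // Mirrors the `count_zero_half_words` helper in `lower/isle.rs` above.
    fn count_zero_half_words(mut value: u64) -> usize {
        let mut count = 0;
        for _ in 0..4 {
            if value & 0xffff == 0 {
                count += 1;
            }
            value >>= 16;
        }
        count
    }

    fn main() {
        // 0xffff_ffff_ffff_1234 contains no 0x0000 halfwords, but its
        // complement contains three, so the sequence starts with MOVN and
        // every 0xffff halfword becomes skippable.
        let value: u64 = 0xffff_ffff_ffff_1234;
        let first_is_inverted = count_zero_half_words(!value) > count_zero_half_words(value);
        assert!(first_is_inverted);

        // A single `movn x0, #0xedcb` then materializes the whole constant,
        // since MOVN writes the bitwise NOT of its (shifted) immediate;
        // starting from MOVZ would instead cost one MOVZ plus three MOVKs.
        let movn_imm = !value & 0xffff;
        assert_eq!(movn_imm, 0xedcb);
        assert_eq!(!movn_imm, value);
    }

Skipping ignored halfwords is also why `last_index` in the `inst/mod.rs` copy
need not be 3: the last emitted instruction, whichever halfword it covers, is
the one that must define the real destination.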
diff --git a/cranelift/codegen/src/isa/riscv64/abi.rs b/cranelift/codegen/src/isa/riscv64/abi.rs
index 995a52fbc0..d3d0cf1032 100644
--- a/cranelift/codegen/src/isa/riscv64/abi.rs
+++ b/cranelift/codegen/src/isa/riscv64/abi.rs
@@ -525,18 +525,18 @@ impl ABIMachineSpec for Riscv64MachineDeps {
         insts
     }
 
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
         call_conv: isa::CallConv,
         dst: Reg,
         src: Reg,
-        tmp: Writable<Reg>,
-        _tmp2: Writable<Reg>,
         size: usize,
+        mut alloc_tmp: F,
     ) -> SmallVec<[Self::I; 8]> {
         let mut insts = SmallVec::new();
         let arg0 = Writable::from_reg(x_reg(10));
         let arg1 = Writable::from_reg(x_reg(11));
         let arg2 = Writable::from_reg(x_reg(12));
+        let tmp = alloc_tmp(Self::word_type());
         insts.extend(Inst::load_constant_u64(tmp, size as u64).into_iter());
         insts.push(Inst::Call {
             info: Box::new(CallInfo {
diff --git a/cranelift/codegen/src/isa/s390x/abi.rs b/cranelift/codegen/src/isa/s390x/abi.rs
index f123a48966..27c3a94ed6 100644
--- a/cranelift/codegen/src/isa/s390x/abi.rs
+++ b/cranelift/codegen/src/isa/s390x/abi.rs
@@ -752,13 +752,12 @@ impl ABIMachineSpec for S390xMachineDeps {
         unreachable!();
     }
 
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
         _call_conv: isa::CallConv,
         _dst: Reg,
         _src: Reg,
-        _tmp1: Writable<Reg>,
-        _tmp2: Writable<Reg>,
         _size: usize,
+        _alloc: F,
     ) -> SmallVec<[Self::I; 8]> {
         unimplemented!("StructArgs not implemented for S390X yet");
     }
diff --git a/cranelift/codegen/src/isa/x64/abi.rs b/cranelift/codegen/src/isa/x64/abi.rs
index d2b9d6e18e..bab5bc8ac1 100644
--- a/cranelift/codegen/src/isa/x64/abi.rs
+++ b/cranelift/codegen/src/isa/x64/abi.rs
@@ -624,18 +624,19 @@ impl ABIMachineSpec for X64ABIMachineSpec {
         insts
     }
 
-    fn gen_memcpy(
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
         call_conv: isa::CallConv,
         dst: Reg,
         src: Reg,
-        temp: Writable<Reg>,
-        temp2: Writable<Reg>,
         size: usize,
+        mut alloc_tmp: F,
     ) -> SmallVec<[Self::I; 8]> {
         let mut insts = SmallVec::new();
         let arg0 = get_intreg_for_arg(&call_conv, 0, 0).unwrap();
         let arg1 = get_intreg_for_arg(&call_conv, 1, 1).unwrap();
         let arg2 = get_intreg_for_arg(&call_conv, 2, 2).unwrap();
+        let temp = alloc_tmp(Self::word_type());
+        let temp2 = alloc_tmp(Self::word_type());
         insts.extend(
             Inst::gen_constant(ValueRegs::one(temp), size as u128, I64, |_| {
                 panic!("tmp should not be needed")
diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs
index b82ce632e1..1d94d4ab9a 100644
--- a/cranelift/codegen/src/machinst/abi.rs
+++ b/cranelift/codegen/src/machinst/abi.rs
@@ -567,16 +567,14 @@ pub trait ABIMachineSpec {
     ) -> SmallVec<[Self::I; 2]>;
 
     /// Generate a memcpy invocation. Used to set up struct
-    /// args. Takes `src`, `dst` as read-only inputs and requires two
-    /// temporaries to generate the call (for the size immediate and
-    /// possibly for the address of `memcpy` itself).
-    fn gen_memcpy(
+    /// args. Takes `src`, `dst` as read-only inputs and passes a temporary
+    /// allocator.
+    fn gen_memcpy<F: FnMut(Type) -> Writable<Reg>>(
         call_conv: isa::CallConv,
         dst: Reg,
         src: Reg,
-        tmp1: Writable<Reg>,
-        tmp2: Writable<Reg>,
         size: usize,
+        alloc_tmp: F,
     ) -> SmallVec<[Self::I; 8]>;
 
     /// Get the number of spillslots required for the given register-class.
@@ -2152,15 +2150,12 @@ impl<M: ABIMachineSpec> Caller<M> {
         // arg regs.
let memcpy_call_conv = isa::CallConv::for_libcall(&self.flags, ctx.sigs()[self.sig].call_conv); - let tmp1 = ctx.alloc_tmp(M::word_type()).only_reg().unwrap(); - let tmp2 = ctx.alloc_tmp(M::word_type()).only_reg().unwrap(); for insn in M::gen_memcpy( memcpy_call_conv, dst_ptr.to_reg(), src_ptr, - tmp1, - tmp2, size as usize, + |ty| ctx.alloc_tmp(ty).only_reg().unwrap(), ) .into_iter() { diff --git a/cranelift/filetests/filetests/isa/aarch64/icmp-const.clif b/cranelift/filetests/filetests/isa/aarch64/icmp-const.clif index d48e8c5019..9911422450 100644 --- a/cranelift/filetests/filetests/isa/aarch64/icmp-const.clif +++ b/cranelift/filetests/filetests/isa/aarch64/icmp-const.clif @@ -37,9 +37,9 @@ block0(v0: i32): } ; block0: -; movz w2, #4369 -; movk w2, w2, #17, LSL #16 -; subs wzr, w0, w2 +; movz w3, #4369 +; movk w3, w3, #17, LSL #16 +; subs wzr, w0, w3 ; cset x0, hs ; ret @@ -51,9 +51,9 @@ block0(v0: i32): } ; block0: -; movz w2, #4368 -; movk w2, w2, #17, LSL #16 -; subs wzr, w0, w2 +; movz w3, #4368 +; movk w3, w3, #17, LSL #16 +; subs wzr, w0, w3 ; cset x0, hs ; ret @@ -89,9 +89,9 @@ block0(v0: i32): } ; block0: -; movz w2, #4369 -; movk w2, w2, #17, LSL #16 -; subs wzr, w0, w2 +; movz w3, #4369 +; movk w3, w3, #17, LSL #16 +; subs wzr, w0, w3 ; cset x0, ge ; ret @@ -103,9 +103,9 @@ block0(v0: i32): } ; block0: -; movz w2, #4368 -; movk w2, w2, #17, LSL #16 -; subs wzr, w0, w2 +; movz w3, #4368 +; movk w3, w3, #17, LSL #16 +; subs wzr, w0, w3 ; cset x0, ge ; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif b/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif index 8f603a919e..970a31004c 100644 --- a/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif +++ b/cranelift/filetests/filetests/isa/aarch64/vhigh_bits.clif @@ -14,11 +14,11 @@ block0(v0: i8x16): ; movk x5, x5, #8208, LSL #32 ; movk x5, x5, #32832, LSL #48 ; dup v16.2d, x5 -; and v19.16b, v2.16b, v16.16b -; ext v21.16b, v19.16b, v19.16b, #8 -; zip1 v23.16b, v19.16b, v21.16b -; addv h25, v23.8h -; umov w0, v25.h[0] +; and v22.16b, v2.16b, v16.16b +; ext v24.16b, v22.16b, v22.16b, #8 +; zip1 v26.16b, v22.16b, v24.16b +; addv h28, v26.8h +; umov w0, v28.h[0] ; ret function %f2(i8x16) -> i16 { @@ -34,11 +34,11 @@ block0(v0: i8x16): ; movk x5, x5, #8208, LSL #32 ; movk x5, x5, #32832, LSL #48 ; dup v16.2d, x5 -; and v19.16b, v2.16b, v16.16b -; ext v21.16b, v19.16b, v19.16b, #8 -; zip1 v23.16b, v19.16b, v21.16b -; addv h25, v23.8h -; umov w0, v25.h[0] +; and v22.16b, v2.16b, v16.16b +; ext v24.16b, v22.16b, v22.16b, #8 +; zip1 v26.16b, v22.16b, v24.16b +; addv h28, v26.8h +; umov w0, v28.h[0] ; ret function %f3(i16x8) -> i8 {
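Taken together, the `gen_memcpy` changes across the backends replace two fixed
temporaries with the same callback style. Below is a minimal, stand-alone
model of that interface change; `AbiSpec`, `DemoBackend`, `Reg`, and `Ty` are
hypothetical stand-ins, not the real `ABIMachineSpec` machinery:

    #[derive(Clone, Copy, Debug)]
    struct Reg(u32);

    #[derive(Clone, Copy, Debug)]
    enum Ty {
        I64,
    }

    trait AbiSpec {
        // The old interface baked exactly two temporaries into every
        // backend's signature; the new one lets each backend request exactly
        // the temporaries it needs, of the types it needs.
        fn gen_memcpy<F: FnMut(Ty) -> Reg>(
            dst: Reg,
            src: Reg,
            size: usize,
            alloc_tmp: F,
        ) -> Vec<String>;
    }

    struct DemoBackend;

    impl AbiSpec for DemoBackend {
        fn gen_memcpy<F: FnMut(Ty) -> Reg>(
            dst: Reg,
            src: Reg,
            size: usize,
            mut alloc_tmp: F,
        ) -> Vec<String> {
            // One temporary for the size immediate; an aarch64-style backend
            // may ask again for each intermediate of a multi-instruction
            // constant load.
            let tmp = alloc_tmp(Ty::I64);
            vec![
                format!("load_const {tmp:?}, #{size}"),
                format!("call memcpy({dst:?}, {src:?}, {tmp:?})"),
            ]
        }
    }

    fn main() {
        // The shared ABI code passes a closure over its vreg allocator, the
        // analogue of `|ty| ctx.alloc_tmp(ty).only_reg().unwrap()` above.
        let mut next = 0;
        let insts = DemoBackend::gen_memcpy(Reg(0), Reg(1), 64, |_ty| {
            next += 1;
            Reg(next)
        });
        for inst in insts {
            println!("{inst}");
        }
    }

The payoff is that a backend whose constant loads may need several
intermediates, as aarch64 does after this patch, is no longer constrained by
however many scratch registers the shared ABI code happened to pass.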