AArch64 codegen quality: support more general add+extend address computations.
Previously, our pattern-matching for generating load/store addresses was
somewhat limited. For example, it could not use a register-extend
address mode to handle the following CLIF:
```
v2760 = uextend.i64 v985
v2761 = load.i64 notrap aligned readonly v1
v1018 = iadd v2761, v2760
store v1017, v1018
```
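With this change, the `uextend` and the `iadd` can fold into the store's addressing mode, so the store is emitted as a single extended-register access of the form `str xN, [xM, wK, UXTW]` (register choices illustrative), with no separate extend or add instructions.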
This PR adds more general support for address expressions made up of
additions and extensions. In particular, it pattern-matches a tree of
64-bit `iadd`s, optionally with `uextend`/`sextend` from 32-bit values
at the leaves, to collect the list of all addends that form the address.
It also collects any constant offsets at the leaves, combining them into a single offset.
It then applies a series of heuristics to make the best use of the
available addressing modes, filling the load/store's own addressing mode
with as many of the 64-bit registers, zero/sign-extended 32-bit registers,
and/or the offset as it can, and computing the rest with add instructions
as necessary. It uses the immediate forms (add-immediate or
subtract-immediate) whenever the remaining offset fits, and uses the
built-in extend operators on add instructions when possible. There are
certainly cases where this is not optimal (i.e., it does not generate the
strictly shortest sequence of instructions), but it should be good enough
for most code.
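As a rough illustration of the mode-selection step, here is a simplified sketch of the kind of preference order used. This is not the Cranelift code or API: `Reg`, `Extend`, `AddrMode`, and `choose_mode` are hypothetical stand-ins, and only the common case where at least one 64-bit addend exists is shown.
```
#[derive(Debug, Clone, Copy, PartialEq)]
enum Extend {
    Uxtw,
    Sxtw,
}

type Reg = u32; // stand-in for a real register handle

#[derive(Debug, PartialEq)]
enum AddrMode {
    RegExtended(Reg, Reg, Extend), // e.g. [x0, w1, SXTW]
    RegOffset(Reg, i64),           // e.g. [x0, #16]
    RegReg(Reg, Reg),              // e.g. [x0, x1]
    Base(Reg),                     // e.g. [x0]
}

/// Pick the addressing mode for the access itself, consuming the addends it
/// uses; whatever remains must later be summed into a temporary register.
fn choose_mode(
    addends64: &mut Vec<Reg>,
    addends32: &mut Vec<(Reg, Extend)>,
    offset: &mut i64,
) -> Option<AddrMode> {
    let base = addends64.pop()?;
    if let Some((index, ext)) = addends32.pop() {
        // Prefer folding a sign/zero-extended 32-bit index into the access.
        Some(AddrMode::RegExtended(base, index, ext))
    } else if *offset > 0 && *offset < 0x1000 {
        // Otherwise fold a small positive offset into the access.
        Some(AddrMode::RegOffset(base, std::mem::take(offset)))
    } else if let Some(index) = addends64.pop() {
        // Otherwise fold a second 64-bit register into the access.
        Some(AddrMode::RegReg(base, index))
    } else {
        Some(AddrMode::Base(base))
    }
}

fn main() {
    // One 64-bit base (x1), one sign-extended 32-bit index (w2), offset 8.
    let mut addends64 = vec![1];
    let mut addends32 = vec![(2, Extend::Sxtw)];
    let mut offset = 8i64;
    let mode = choose_mode(&mut addends64, &mut addends32, &mut offset);
    // The offset is left over here, so the real lowering would fold it in
    // with an add-immediate into a temporary register before the access.
    println!("mode = {:?}, leftover offset = {}", mode, offset);
}
```
Any addends and offset not consumed by the chosen mode are then summed into a temporary register with add instructions, and that temporary replaces one of the registers in the mode.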
Using `perf stat` to measure instruction count (runtime only, on
wasmtime, after populating the cache to avoid measuring compilation),
this impacts `bz2` as follows:
```
pre:

     1006.410425      task-clock (msec)        #    1.000 CPUs utilized
             113      context-switches         #    0.112 K/sec
               1      cpu-migrations           #    0.001 K/sec
           5,036      page-faults              #    0.005 M/sec
   3,221,547,476      cycles                   #    3.201 GHz
   4,000,670,104      instructions             #    1.24  insn per cycle
 <not supported>      branches
      27,958,613      branch-misses

     1.006071348 seconds time elapsed

post:

      963.499525      task-clock (msec)        #    0.997 CPUs utilized
             117      context-switches         #    0.121 K/sec
               0      cpu-migrations           #    0.000 K/sec
           5,081      page-faults              #    0.005 M/sec
   3,039,687,673      cycles                   #    3.155 GHz
   3,837,761,690      instructions             #    1.26  insn per cycle
 <not supported>      branches
      28,254,585      branch-misses

     0.966072682 seconds time elapsed
```
In other words, this reduces instruction count by 4.1% on `bz2`.
```
@@ -2,9 +2,8 @@
 //!
 //! TODO: opportunities for better code generation:
 //!
-//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns; recognize
-//!   and incorporate sign/zero extension on indices. Recognize pre/post-index
-//!   opportunities.
+//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns. Recognize
+//!   pre/post-index opportunities.
 //!
 //! - Floating-point immediates (FIMM instruction).
 
@@ -21,8 +20,9 @@ use crate::isa::aarch64::AArch64Backend;
 use super::lower_inst;
 
-use log::debug;
+use log::{debug, trace};
 use regalloc::{Reg, RegClass, Writable};
+use smallvec::SmallVec;
 
 //============================================================================
 // Result enum types.
@@ -544,105 +544,251 @@ pub(crate) fn alu_inst_immshift(
 // Lowering: addressing mode support. Takes instruction directly, rather
 // than an `InsnInput`, to do more introspection.
 
+/// 32-bit addends that make up an address: an input, and an extension mode on that
+/// input.
+type AddressAddend32List = SmallVec<[(Reg, ExtendOp); 4]>;
+/// 64-bit addends that make up an address: just an input.
+type AddressAddend64List = SmallVec<[Reg; 4]>;
+
+/// Collect all addends that feed into an address computation, with extend-modes
+/// on each. Note that a load/store may have multiple address components (and
+/// the CLIF semantics are that these components are added to form the final
+/// address), but sometimes the CLIF that we receive still has arguments that
+/// refer to `iadd` instructions. We also want to handle uextend/sextend below
+/// the add(s).
+///
+/// We match any 64-bit add (and descend into its inputs), and we match any
+/// 32-to-64-bit sign or zero extension. The returned addend-list will use
+/// NarrowValueMode values to indicate how to extend each input:
+///
+/// - NarrowValueMode::None: the associated input is 64 bits wide; no extend.
+/// - NarrowValueMode::SignExtend64: the associated input is 32 bits wide;
+///   do a sign-extension.
+/// - NarrowValueMode::ZeroExtend64: the associated input is 32 bits wide;
+///   do a zero-extension.
+///
+/// We do not descend further into the inputs of extensions, because supporting
+/// (e.g.) a 32-bit add that is later extended would require additional masking
+/// of high-order bits, which is too complex. So, in essence, we descend any
+/// number of adds from the roots, collecting all 64-bit address addends; then
+/// possibly support extensions at these leaves.
+fn collect_address_addends<C: LowerCtx<I = Inst>>(
+    ctx: &mut C,
+    roots: &[InsnInput],
+) -> (AddressAddend64List, AddressAddend32List, i64) {
+    let mut result32: AddressAddend32List = SmallVec::new();
+    let mut result64: AddressAddend64List = SmallVec::new();
+    let mut offset: i64 = 0;
+
+    let mut workqueue: SmallVec<[InsnInput; 4]> = roots.iter().cloned().collect();
+
+    while let Some(input) = workqueue.pop() {
+        debug_assert!(ty_bits(ctx.input_ty(input.insn, input.input)) == 64);
+        if let Some((op, insn)) = maybe_input_insn_multi(
+            ctx,
+            input,
+            &[
+                Opcode::Uextend,
+                Opcode::Sextend,
+                Opcode::Iadd,
+                Opcode::Iconst,
+            ],
+        ) {
+            match op {
+                Opcode::Uextend | Opcode::Sextend if ty_bits(ctx.input_ty(insn, 0)) == 32 => {
+                    let extendop = if op == Opcode::Uextend {
+                        ExtendOp::UXTW
+                    } else {
+                        ExtendOp::SXTW
+                    };
+                    let extendee_input = InsnInput { insn, input: 0 };
+                    let reg = put_input_in_reg(ctx, extendee_input, NarrowValueMode::None);
+                    result32.push((reg, extendop));
+                }
+                Opcode::Uextend | Opcode::Sextend => {
+                    let reg = put_input_in_reg(ctx, input, NarrowValueMode::None);
+                    result64.push(reg);
+                }
+                Opcode::Iadd => {
+                    for input in 0..ctx.num_inputs(insn) {
+                        let addend = InsnInput { insn, input };
+                        workqueue.push(addend);
+                    }
+                }
+                Opcode::Iconst => {
+                    let value: i64 = ctx.get_constant(insn).unwrap() as i64;
+                    offset += value;
+                }
+                _ => panic!("Unexpected opcode from maybe_input_insn_multi"),
+            }
+        } else {
+            let reg = put_input_in_reg(ctx, input, NarrowValueMode::ZeroExtend64);
+            result64.push(reg);
+        }
+    }
+
+    (result64, result32, offset)
+}
+
 /// Lower the address of a load or store.
 pub(crate) fn lower_address<C: LowerCtx<I = Inst>>(
     ctx: &mut C,
     elem_ty: Type,
-    addends: &[InsnInput],
+    roots: &[InsnInput],
     offset: i32,
 ) -> MemArg {
     // TODO: support base_reg + scale * index_reg. For this, we would need to pattern-match shl or
     // mul instructions (Load/StoreComplex don't include scale factors).
 
-    // Handle one reg and offset.
-    if addends.len() == 1 {
-        let reg = put_input_in_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64);
-        return MemArg::RegOffset(reg, offset as i64, elem_ty);
-    }
-
-    // Handle two regs and a zero offset with built-in extend, if possible.
-    if addends.len() == 2 && offset == 0 {
-        // r1, r2 (to be extended), r2_bits, is_signed
-        let mut parts: Option<(Reg, Reg, usize, bool)> = None;
-        // Handle extension of either first or second addend.
-        for i in 0..2 {
-            if let Some((op, ext_insn)) =
-                maybe_input_insn_multi(ctx, addends[i], &[Opcode::Uextend, Opcode::Sextend])
-            {
-                // Non-extended addend.
-                let r1 = put_input_in_reg(ctx, addends[1 - i], NarrowValueMode::ZeroExtend64);
-                // Extended addend.
-                let r2 = put_input_in_reg(
-                    ctx,
-                    InsnInput {
-                        insn: ext_insn,
-                        input: 0,
-                    },
-                    NarrowValueMode::None,
-                );
-                let r2_bits = ty_bits(ctx.input_ty(ext_insn, 0));
-                parts = Some((
-                    r1,
-                    r2,
-                    r2_bits,
-                    /* is_signed = */ op == Opcode::Sextend,
-                ));
-                break;
-            }
-        }
-
-        if let Some((r1, r2, r2_bits, is_signed)) = parts {
-            match (r2_bits, is_signed) {
-                (32, false) => {
-                    return MemArg::RegExtended(r1, r2, ExtendOp::UXTW);
-                }
-                (32, true) => {
-                    return MemArg::RegExtended(r1, r2, ExtendOp::SXTW);
-                }
-                _ => {}
-            }
-        }
-    }
-
-    // Handle two regs and a zero offset in the general case, if possible.
-    if addends.len() == 2 && offset == 0 {
-        let ra = put_input_in_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64);
-        let rb = put_input_in_reg(ctx, addends[1], NarrowValueMode::ZeroExtend64);
-        return MemArg::reg_plus_reg(ra, rb);
-    }
-
-    // Otherwise, generate add instructions.
-    let addr = ctx.alloc_tmp(RegClass::I64, I64);
-
-    // Get the const into a reg.
-    lower_constant_u64(ctx, addr.clone(), offset as u64);
-
-    // Add each addend to the address.
-    for addend in addends {
-        let reg = put_input_in_reg(ctx, *addend, NarrowValueMode::ZeroExtend64);
-
-        // In an addition, the stack register is the zero register, so divert it to another
-        // register just before doing the actual add.
-        let reg = if reg == stack_reg() {
-            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
-            ctx.emit(Inst::Mov {
-                rd: tmp,
-                rm: stack_reg(),
-            });
-            tmp.to_reg()
-        } else {
-            reg
-        };
-
-        ctx.emit(Inst::AluRRR {
-            alu_op: ALUOp::Add64,
-            rd: addr.clone(),
-            rn: addr.to_reg(),
-            rm: reg.clone(),
-        });
-    }
-
-    MemArg::reg(addr.to_reg())
+    // Collect addends through an arbitrary tree of 32-to-64-bit sign/zero
+    // extends and addition ops. We update these as we consume address
+    // components, so they represent the remaining addends not yet handled.
+    let (mut addends64, mut addends32, args_offset) = collect_address_addends(ctx, roots);
+    let mut offset = args_offset + (offset as i64);
+
+    trace!(
+        "lower_address: addends64 {:?}, addends32 {:?}, offset {}",
+        addends64,
+        addends32,
+        offset
+    );
+
+    // First, decide what the `MemArg` will be. Take one extendee and one 64-bit
+    // reg, or two 64-bit regs, or a 64-bit reg and a 32-bit reg with extension,
+    // or some other combination as appropriate.
+    let memarg = if addends64.len() > 0 {
+        if addends32.len() > 0 {
+            let (reg32, extendop) = addends32.pop().unwrap();
+            let reg64 = addends64.pop().unwrap();
+            MemArg::RegExtended(reg64, reg32, extendop)
+        } else if offset > 0 && offset < 0x1000 {
+            let reg64 = addends64.pop().unwrap();
+            let off = offset;
+            offset = 0;
+            MemArg::RegOffset(reg64, off, elem_ty)
+        } else if addends64.len() >= 2 {
+            let reg1 = addends64.pop().unwrap();
+            let reg2 = addends64.pop().unwrap();
+            MemArg::RegReg(reg1, reg2)
+        } else {
+            let reg1 = addends64.pop().unwrap();
+            MemArg::reg(reg1)
+        }
+    } else
+    /* addends64.len() == 0 */
+    {
+        if addends32.len() > 0 {
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+            let (reg1, extendop) = addends32.pop().unwrap();
+            let signed = match extendop {
+                ExtendOp::SXTW => true,
+                ExtendOp::UXTW => false,
+                _ => unreachable!(),
+            };
+            ctx.emit(Inst::Extend {
+                rd: tmp,
+                rn: reg1,
+                signed,
+                from_bits: 32,
+                to_bits: 64,
+            });
+            if let Some((reg2, extendop)) = addends32.pop() {
+                MemArg::RegExtended(tmp.to_reg(), reg2, extendop)
+            } else {
+                MemArg::reg(tmp.to_reg())
+            }
+        } else
+        /* addends32.len() == 0 */
+        {
+            let off_reg = ctx.alloc_tmp(RegClass::I64, I64);
+            lower_constant_u64(ctx, off_reg, offset as u64);
+            offset = 0;
+            MemArg::reg(off_reg.to_reg())
+        }
+    };
+
+    // At this point, if we have any remaining components, we need to allocate a
+    // temp, replace one of the registers in the MemArg with the temp, and emit
+    // instructions to add together the remaining components. Return immediately
+    // if this is *not* the case.
+    if offset == 0 && addends32.len() == 0 && addends64.len() == 0 {
+        return memarg;
+    }
+
+    // Allocate the temp and shoehorn it into the MemArg.
+    let addr = ctx.alloc_tmp(RegClass::I64, I64);
+    let (reg, memarg) = match memarg {
+        MemArg::RegExtended(r1, r2, extendop) => {
+            (r1, MemArg::RegExtended(addr.to_reg(), r2, extendop))
+        }
+        MemArg::RegOffset(r, off, ty) => (r, MemArg::RegOffset(addr.to_reg(), off, ty)),
+        MemArg::RegReg(r1, r2) => (r2, MemArg::RegReg(addr.to_reg(), r1)),
+        MemArg::UnsignedOffset(r, imm) => (r, MemArg::UnsignedOffset(addr.to_reg(), imm)),
+        _ => unreachable!(),
+    };
+
+    // If there is any offset, load that first into `addr`, and add the `reg`
+    // that we kicked out of the `MemArg`; otherwise, start with that reg.
+    if offset != 0 {
+        // If we can fit offset or -offset in an imm12, use an add-imm
+        // to combine the reg and offset. Otherwise, load value first then add.
+        if let Some(imm12) = Imm12::maybe_from_u64(offset as u64) {
+            ctx.emit(Inst::AluRRImm12 {
+                alu_op: ALUOp::Add64,
+                rd: addr,
+                rn: reg,
+                imm12,
+            });
+        } else if let Some(imm12) = Imm12::maybe_from_u64(offset.wrapping_neg() as u64) {
+            ctx.emit(Inst::AluRRImm12 {
+                alu_op: ALUOp::Sub64,
+                rd: addr,
+                rn: reg,
+                imm12,
+            });
+        } else {
+            lower_constant_u64(ctx, addr, offset as u64);
+            ctx.emit(Inst::AluRRR {
+                alu_op: ALUOp::Add64,
+                rd: addr,
+                rn: addr.to_reg(),
+                rm: reg,
+            });
+        }
+    } else {
+        ctx.emit(Inst::gen_move(addr, reg, I64));
+    }
+
+    // Now handle reg64 and reg32-extended components.
+    for reg in addends64 {
+        // If the register is the stack reg, we must move it to another reg
+        // before adding it.
+        let reg = if reg == stack_reg() {
+            let tmp = ctx.alloc_tmp(RegClass::I64, I64);
+            ctx.emit(Inst::gen_move(tmp, stack_reg(), I64));
+            tmp.to_reg()
+        } else {
+            reg
+        };
+
+        ctx.emit(Inst::AluRRR {
+            alu_op: ALUOp::Add64,
+            rd: addr,
+            rn: addr.to_reg(),
+            rm: reg,
+        });
+    }
+    for (reg, extendop) in addends32 {
+        assert!(reg != stack_reg());
+        ctx.emit(Inst::AluRRRExtend {
+            alu_op: ALUOp::Add64,
+            rd: addr,
+            rn: addr.to_reg(),
+            rm: reg,
+            extendop,
+        });
+    }
+
+    memarg
 }
 
 pub(crate) fn lower_constant_u64<C: LowerCtx<I = Inst>>(
```
```
@@ -15,7 +15,7 @@ block0(v0: i64, v1: i32):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
-function %f1(i64, i32) -> i32 {
+function %f2(i64, i32) -> i32 {
 block0(v0: i64, v1: i32):
   v2 = uextend.i64 v1
   v3 = load_complex.i32 v2+v0
@@ -29,7 +29,7 @@ block0(v0: i64, v1: i32):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
-function %f1(i64, i32) -> i32 {
+function %f3(i64, i32) -> i32 {
 block0(v0: i64, v1: i32):
   v2 = sextend.i64 v1
   v3 = load_complex.i32 v0+v2
@@ -43,7 +43,7 @@ block0(v0: i64, v1: i32):
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
 
-function %f1(i64, i32) -> i32 {
+function %f4(i64, i32) -> i32 {
 block0(v0: i64, v1: i32):
   v2 = sextend.i64 v1
   v3 = load_complex.i32 v2+v0
@@ -56,3 +56,216 @@ block0(v0: i64, v1: i32):
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
+
+function %f5(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+  v2 = sextend.i64 v1
+  v3 = iadd.i64 v0, v2
+  v4 = load.i32 v3
+  return v4
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f6(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+  v2 = sextend.i64 v1
+  v3 = iadd.i64 v2, v0
+  v4 = load.i32 v3
+  return v4
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f7(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = uextend.i64 v0
+  v3 = uextend.i64 v1
+  v4 = iadd.i64 v2, v3
+  v5 = load.i32 v4
+  return v5
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov w0, w0
+; nextln: ldr w0, [x0, w1, UXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f8(i64, i32) -> i32 {
+block0(v0: i64, v1: i32):
+  v2 = sextend.i64 v1
+  v3 = iconst.i64 32
+  v4 = iadd.i64 v2, v3
+  v5 = iadd.i64 v4, v0
+  v6 = iadd.i64 v5, v5
+  v7 = load.i32 v6+4
+  return v7
+}
+
+; v6+4 = 2*v5 = 2*v4 + 2*v0 + 4 = 2*v2 + 2*v3 + 2*v0 + 4
+;      = 2*sextend($x1) + 2*$x0 + 68
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x2, x0, #68
+; nextln: add x0, x2, x0
+; nextln: add x0, x0, x1, SXTW
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f9(i64, i64, i64) -> i32 {
+block0(v0: i64, v1: i64, v2: i64):
+  v3 = iconst.i64 48
+  v4 = iadd.i64 v0, v1
+  v5 = iadd.i64 v4, v2
+  v6 = iadd.i64 v5, v3
+  v7 = load.i32 v6
+  return v7
+}
+
+; v6 = $x0 + $x1 + $x2 + 48
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x0, x0, x2
+; nextln: add x0, x0, x1
+; nextln: ldur w0, [x0, #48]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f10(i64, i64, i64) -> i32 {
+block0(v0: i64, v1: i64, v2: i64):
+  v3 = iconst.i64 4100
+  v4 = iadd.i64 v0, v1
+  v5 = iadd.i64 v4, v2
+  v6 = iadd.i64 v5, v3
+  v7 = load.i32 v6
+  return v7
+}
+
+; v6 = $x0 + $x1 + $x2 + 4100
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x3, #4100
+; nextln: add x1, x3, x1
+; nextln: add x1, x1, x2
+; nextln: ldr w0, [x1, x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f10() -> i32 {
+block0:
+  v1 = iconst.i64 1234
+  v2 = load.i32 v1
+  return v2
+}
+
+; v6 = $x0 + $x1 + $x2 + 48
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x0, #1234
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f11(i64) -> i32 {
+block0(v0: i64):
+  v1 = iconst.i64 8388608 ; Imm12: 0x800 << 12
+  v2 = iadd.i64 v0, v1
+  v3 = load.i32 v2
+  return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: add x0, x0, #8388608
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f12(i64) -> i32 {
+block0(v0: i64):
+  v1 = iconst.i64 -4
+  v2 = iadd.i64 v0, v1
+  v3 = load.i32 v2
+  return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sub x0, x0, #4
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f13(i64) -> i32 {
+block0(v0: i64):
+  v1 = iconst.i64 1000000000
+  v2 = iadd.i64 v0, v1
+  v3 = load.i32 v2
+  return v3
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: movz x1, #51712
+; nextln: movk x1, #15258, LSL #16
+; nextln: add x0, x1, x0
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f14(i32) -> i32 {
+block0(v0: i32):
+  v1 = sextend.i64 v0
+  v2 = load.i32 v1
+  return v2
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxtw x0, w0
+; nextln: ldr w0, [x0]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f15(i32, i32) -> i32 {
+block0(v0: i32, v1: i32):
+  v2 = sextend.i64 v0
+  v3 = sextend.i64 v1
+  v4 = iadd.i64 v2, v3
+  v5 = load.i32 v4
+  return v5
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxtw x0, w0
+; nextln: ldr w0, [x0, w1, SXTW]
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
@@ -15,7 +15,7 @@ block0(v0: i64, v1: i32):
 ; check: Block 0:
 ; check: stp fp, lr, [sp, #-16]!
 ; nextln: mov fp, sp
-; nextln: ldur w2, [x0]
+; nextln: ldr w2, [x0]
 ; nextln: add w2, w2, #0
 ; nextln: subs wzr, w1, w2
 ; nextln: b.ls label1 ; b label2
@@ -92,7 +92,7 @@ block3(v7: r64, v8: r64):
 ; nextln: ldur x19, [sp, #32]
 ; nextln: ldur x20, [sp, #40]
 ; nextln: add x1, sp, #16
-; nextln: stur x19, [x1]
+; nextln: str x19, [x1]
 ; nextln: and w0, w0, #1
 ; nextln: cbz x0, label1 ; b label3
 ; check: Block 1:
@@ -108,7 +108,7 @@ block3(v7: r64, v8: r64):
 ; nextln: b label5
 ; check: Block 5:
 ; check: add x1, sp, #16
-; nextln: ldur x1, [x1]
+; nextln: ldr x1, [x1]
 ; nextln: mov x2, x1
 ; nextln: mov x1, x19
 ; nextln: ldp x19, x20, [sp], #16
@@ -51,7 +51,7 @@ block0:
 ; nextln: mov fp, sp
 ; nextln: sub sp, sp, #16
 ; nextln: mov x0, sp
-; nextln: ldur x0, [x0]
+; nextln: ldr x0, [x0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -71,7 +71,7 @@ block0:
 ; nextln: ldr x16, 8 ; b 12 ; data 100016
 ; nextln: sub sp, sp, x16, UXTX
 ; nextln: mov x0, sp
-; nextln: ldur x0, [x0]
+; nextln: ldr x0, [x0]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -89,7 +89,7 @@ block0(v0: i64):
 ; nextln: mov fp, sp
 ; nextln: sub sp, sp, #16
 ; nextln: mov x1, sp
-; nextln: stur x0, [x1]
+; nextln: str x0, [x1]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
@@ -109,7 +109,7 @@ block0(v0: i64):
 ; nextln: ldr x16, 8 ; b 12 ; data 100016
 ; nextln: sub sp, sp, x16, UXTX
 ; nextln: mov x1, sp
-; nextln: stur x0, [x1]
+; nextln: str x0, [x1]
 ; nextln: mov sp, fp
 ; nextln: ldp fp, lr, [sp], #16
 ; nextln: ret
```